Example #1
# Imports assumed by this snippet; DocumentParser, SentenceParser and the
# dict/TF-IDF helpers come from the surrounding project.
import json
import logging
import time

import numpy as np


def main():
	# Initialization
	starttime = time.time()
	files     = ["doc%01d.txt" % r for r in range(1,4)]
	docparser = DocumentParser()
	senparser = SentenceParser()

	# Parse the input files into lists of sentences and words
	# And use TF-IDF score to rank each word in a document
	logging.debug('Parsing Documents Into Sentences')
	sentences = []  # list containing all sentences in documents
	doclength = []  # list containing number of sentences in documents
	fdict     = {}  # dictionary for tokenized words in documents
	tfdict    = {}  # dictionary for tfidf-scored (top) words in documents
	fidict    = {}  # dictionary for overlapped words in all documents

	for num_f,f in enumerate(files):
		logging.debug('Loop Over Document [ {} ]'.format(f))
		with open(f, 'r') as fh:
			text = fh.read()
		for s in docparser.parse(text):
			doclength.append(len(s))

			filtersentences = []

			for num_s,ss in enumerate(s):
				sentences.append(ss)
				for w,newsentence in senparser.parse(ss):
					filtersentences.append(newsentence)
					wdict = senparser.count(SentenceID=num_s, DocID=num_f)
					fdict = combineCustomDict(fdict,wdict)
								
			logging.debug('Perform TF-IDF Analysis for Document [ {} ]'.format(f))
			tfdict = tfidf(filtersentences,top=100)

			fidict = intersectByKeyDict(fidict,tfdict) 
	
	# Find the top word 
	topnumber = 1
	logging.debug('Find the top {} words between all Documents'.format(topnumber))
	topkeys = topDict(fidict, top=topnumber)

	# Output the corresponding Document and Sentence Info
	ftable = {}  # dictionary for hashtag table
	doclength.insert(0,0)
	for tkey in topkeys:
		ftable[tkey] = {}
		#ftable[tkey]['score']= np.mean(fidict[tkey])
		ftable[tkey]['doc']  = [ files[i] for i in np.unique(fdict[tkey]['docid']).tolist() ]

		for sen_id, doc_id in zip(fdict[tkey]['sentenceid'], fdict[tkey]['docid']):
			senindex = sen_id + sum(doclength[:doc_id + 1])
			ftable.setdefault(tkey, {}).setdefault('sentences', []).append(sentences[senindex])

	print("The Hashtag Table is\n {}".format(json.dumps(ftable,indent=2)),file=open('hashtag_table.txt','w'))

	# Post-Run Analysis
	endtime = time.time()
	logging.debug('Total Runtime: {}'.format(endtime-starttime))
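Example #1 depends on a few helper functions (`combineCustomDict`, `intersectByKeyDict`, `topDict`) that are not shown. The following is only a minimal sketch of what they could look like, assuming `fdict` maps each word to per-occurrence metadata and the TF-IDF dict maps each word to a score; the real project may differ.

def combineCustomDict(base, new):
	# Merge per-word occurrence metadata from `new` into `base`.
	for word, info in new.items():
		entry = base.setdefault(word, {'sentenceid': [], 'docid': []})
		entry['sentenceid'].extend(info.get('sentenceid', []))
		entry['docid'].extend(info.get('docid', []))
	return base


def intersectByKeyDict(base, new):
	# Keep only words seen in every document so far (first call keeps everything).
	if not base:
		return dict(new)
	return {word: new[word] for word in base if word in new}


def topDict(scores, top=1):
	# Return the `top` keys with the highest scores.
	return sorted(scores, key=scores.get, reverse=True)[:top]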
Example #2
    def run(self):
        CHARS_PER_LINE = 54
        LINES_PER_PAGE = 30
        with open(self.HASHES, 'rb') as f:
            hashes = joblib.load(f)
        document_parser = DocumentParser(hashes, CHARS_PER_LINE, LINES_PER_PAGE)
        pdf_path = re.sub('docx', 'pdf', self.doc_path)

        try:
            document_parser.parse_document(self.document, pdf_path)
            self.change_value.emit()
        except KeyError as e:
            self.key_exception.emit(str(e)[1])
Example #3
    def create(cls, args):
        """ Creates IM from list of documents in `args` """

        im = cls()

        parser = DocumentParser(args)
        vocab_dict, doc_words_counts = parser.parse(args.d)
        vocab_dict['<UNK>'] = 0
        vocab_sorted = np.sort(list(vocab_dict.keys()))

        # set mappings
        # im.ids2words = np.array(vocab_sorted)
        im.ids2docs = np.array(list(doc_words_counts.keys()))
        im.docs2ids = dict([(doc, i) for i, doc in enumerate(im.ids2docs)])
        im.words2ids = dict([(word, i) for i, word in enumerate(vocab_sorted)])
        im.dft = np.empty(len(im.words2ids))
        for i, w in enumerate(vocab_sorted):
            im.dft[i] = vocab_dict[w]

        ind_col = []  # document indices (columns)
        ind_row = []  # vocabulary indices (rows)
        data = []     # weighted word counts
        # matrix[ind_row[i], ind_col[i]] = data[i]

        # Create the vocabulary x documents matrix.
        # Term-frequency weighting, document-frequency weighting and normalization
        # are applied here.
        # doc_words_counts = {'id': [list_of_words, list_of_counts], ...}
        for doc, (words, counts) in doc_words_counts.items():
            # Document frequencies (dft) for each word of this document.
            doc_doc_f = np.empty_like(counts)
            for i, (word, count) in enumerate(zip(words, counts)):
                ind_col.append(im.docs2ids[doc])
                ind_row.append(im.words2ids[word])
                doc_doc_f[i] = im.dft[im.word2id(word)]

            tf = Funcs.get_term_weight(np.array(counts, dtype=np.float32),
                                       args.d_term_weight)
            df = Funcs.get_doc_weight(doc_doc_f, im.N, args.d_df_weight)
            weight = np.multiply(tf, df, dtype=np.float64)
            normalized = Funcs.get_norm(
                (weight, args.d_vector_norm[1], args.d_vector_norm[2]),
                args.d_vector_norm[0])
            result = np.multiply(weight, normalized, dtype=np.float32)
            # resulting number after term freq, doc freq weighting and normalization
            data.extend(result)

        im.matrix = sparse.csc_matrix((np.array(data), (np.array(ind_row), np.array(ind_col))),
                                      shape=(len(im.words2ids), len(im.ids2docs)), dtype=np.float32)

        return im
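Example #3 calls `im.word2id(word)` without defining it. Since `'<UNK>'` is added to the vocabulary, a plausible sketch (an assumption, not taken from the source) is a lookup with an out-of-vocabulary fallback:

    def word2id(self, word):
        # Map a word to its row index; unknown words fall back to '<UNK>'.
        return self.words2ids.get(word, self.words2ids['<UNK>'])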
Example #4
    def run(self):
        count_docs = 0
        direct_position_map = dict()

        while not self.stop_flag.is_set():
            if self.queue.empty():
                self.event.wait(timeout=1)

            document = None
            try:
                document = self.queue.get(False)
            except queue.Empty:
                continue
            direct_position_map_temp = DocumentParser.compute_direct_positional_map(
                document, self.num_threads)
            aggregate_dicts(direct_position_map, direct_position_map_temp)

            count_docs += 1
            if count_docs == self.bulk_count:
                self.direct_map_service.direct_map(
                    list(direct_position_map.values()))
                direct_position_map = dict()
                count_docs = 0

            self.event.clear()

        self.direct_map_service.direct_map(list(direct_position_map.values()))

        print('STOPPED')
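`aggregate_dicts` is not shown in Example #4. Assuming the positional maps are plain dicts whose list values should be merged on key collisions, a minimal in-place merge could look like this (the value structure is an assumption):

def aggregate_dicts(target, source):
    # Merge `source` into `target` in place; new keys are added, existing
    # list values are extended with the incoming entries.
    for key, value in source.items():
        if key in target and isinstance(target[key], list):
            target[key].extend(value)
        else:
            target[key] = value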
Example #5
    def on_start(self):
        log.log_info("Starting Orchestrator...")

        try:
            log.log_info("Starting IndexHandler...")
            self.index_handler = IndexHandler.start()
            log.log_info("IndexHandler started")
        except Exception:
            log.log_error("Could not start IndexHandler")
            log.log_debug(traceback.format_exc())

        try:
            log.log_info("Starting DocumentParser...")
            self.document_parser = DocumentParser.start(self.index_handler)
            log.log_info("DocumentParser started")
        except Exception:
            log.log_error("Could not start DocumentParser")
            log.log_debug(traceback.format_exc())

        try:
            log.log_info("Starting QueryProcessor...")
            self.query_processor = QueryProcessor.start(
                self.document_parser, self.index_handler)
            log.log_info("QueryProcessor started")
        except Exception:
            log.log_error("Could not start QueryProcessor")
            log.log_debug(traceback.format_exc())

        log.log_info("Orchestrator started")
Example #6
class Indexer:
    def __init__(self, mongo_db):
        self.document_parser = DocumentParser()
        self.posts_service = PostsService(mongo_db)
        self.index_service = IndexService(mongo_db)

    def index_all_current_data(self):

        posts_to_compute = self.posts_service.get_posts_cursor()

        for post_to_compute in posts_to_compute:
            aggregation = self.document_parser.parse_document(post_to_compute)
            for key, value in aggregation.items():

                values = list(value)
                self.index_service.upsert_index({
                    'word': key,
                    'document_id': ObjectId(post_to_compute['_id']),
                    'freq': len(values),
                    'positions': list(values),
                })
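The indexer in Example #6 expects `parse_document` to return a mapping from each word to the positions at which it occurs in the post, since it derives `freq` from `len(values)`. A hypothetical sketch of that contract (the `'text'` field and the whitespace tokenization are assumptions):

    def parse_document(self, post):
        # Hypothetical: map each token of the post body to its list of positions.
        positions = {}
        for pos, token in enumerate(post.get('text', '').lower().split()):
            positions.setdefault(token, []).append(pos)
        return positions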
Example #7
    def run(self):
        CHARS_PER_LINE = 54
        LINES_PER_PAGE = 30
        with open(self.HASHES, 'rb') as f:
            hashes = joblib.load(f)
        document_parser = DocumentParser(hashes, CHARS_PER_LINE,
                                         LINES_PER_PAGE)
        pdf_path = re.sub('docx', 'pdf', self.doc_path)
        d = str(Environment.getExternalStorageDirectory())

        dirs = os.listdir(d)
        for file in dirs:
            print(file)

        try:
            document_parser.parse_document(
                self.document, join("/storage/emulated/0/Download/", "1.pdf"))

        except KeyError as e:
            self.key_exception.emit(str(e)[1])
Example #8
    def __init__(self, mongo_db):
        self.document_parser = DocumentParser()
        self.posts_service = PostsService(mongo_db)
        self.index_service = IndexService(mongo_db)
Example #9
class Retrieval:
    def __init__(self, im, args):
        self.im = im
        self.args = args
        self.parser = DocumentParser(args, topic=True)
        self.change_topics()


    def change_topics(self):
        self.doc_words_counts = self.parser.parse_topics()
        self.ids2topics = np.array(list(self.doc_words_counts.keys()))
        self.topics2ids = dict(zip(self.ids2topics, np.arange(len(self.ids2topics))))


    def search(self):
        """ Run search of topics in documents

        Returns:
            (np.array(doc_names), np.array(scores)) of top 1000 matches

        """
        ind_col = []
        ind_row = []
        data = []

        for topic, (words, counts) in self.doc_words_counts.items():
            q_doc_f = np.array([self.im.dft[self.im.word2id(word)] for word in words], dtype=np.float32)
            for i, (word, count) in enumerate(zip(words, counts)):
                ind_row.append(self.topics2ids[topic])
                ind_col.append(self.im.word2id(word))
                # doc_doc_f[i] = im.dft[im.words2ids[word]]


            tf = Funcs.get_term_weight(np.array(counts, dtype=np.float32), self.args.q_term_weight)
            df = Funcs.get_doc_weight(q_doc_f, self.im.N, self.args.q_df_weight)
            weight = np.multiply(tf, df, dtype=np.float64)
            normalized = Funcs.get_norm((weight, self.args.q_vector_norm[1], self.args.q_vector_norm[2]), self.args.q_vector_norm[0])
            result = np.multiply(weight, normalized, dtype=np.float32)

            data.extend(result)

        matrix = sparse.csr_matrix((np.array(data), (np.array(ind_row), np.array(ind_col))),
                                   shape=(len(self.ids2topics), len(self.im.words2ids)), dtype=np.float32)

        scores = matrix.dot(self.im.matrix)

        # indices of top k documents, not sorted \in [0, ..., |docs|-1]
        top_k_ind = get_top_k(scores, self.args.top_k)

        # scores of top k documents, not sorted
        top_k_scores = np.take_along_axis(scores, top_k_ind, axis=1)

        # indices \in [0, ..., K-1] of sorted scores
        indices_of_sorted_scores = np.argsort(-top_k_scores, axis=1)

        # scores of top k docs, sorted
        top_k_scores = np.take_along_axis(top_k_scores, indices_of_sorted_scores, axis=1)

        # indices of top k docs, sorted \in [0, ..., |docs|-1]
        top_k_ind = np.take_along_axis(top_k_ind, indices_of_sorted_scores, axis=1)

        # top 1000 docs sorted in decreasing order of score
        top_k_docs = self.im.ids2docs[top_k_ind]

        return top_k_docs, top_k_scores


    def perform_retrieval(self):
        """ Performs the search for the topics from the input file given by the -q argument
        and writes the result to the file given by the -o argument. """
        doc_names, scores = self.search()
        # 10.2452/401-AH 0 LN-20020201065 0 0 baseline
        # 1. qid 2. iter 3. docno 4. rank 5. sim 6. run_id
        lines = []
        for i, topic in enumerate(self.ids2topics):
            for rank, (docno, score) in enumerate(zip(doc_names[i], scores[i])):
                if score <= 0.0000001:
                    # score is too low
                    break
                lines.append('\t'.join([topic, '0', docno, str(rank), '{:.7f}'.format(score), self.args.r]))
        text = '\n'.join(lines)

        # note: -o is a program argument, given by the assignment description
        with open(self.args.o, 'w') as f:
            f.write(text)

    def _check(self):
        print(self.topics2ids)
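`get_top_k` in Example #9 is not defined; its result is re-sorted afterwards with `np.argsort`, so it only has to return the unsorted column indices of the k best scores per row. A sketch using `np.argpartition`, assuming `scores` has already been densified (e.g. with `.toarray()`):

import numpy as np

def get_top_k(scores, k):
    # Unsorted column indices of the k largest scores in each row.
    scores = np.asarray(scores)
    k = min(k, scores.shape[1])
    return np.argpartition(-scores, k - 1, axis=1)[:, :k]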
Example #10
    def __init__(self, im, args):
        self.im = im
        self.args = args
        self.parser = DocumentParser(args, topic=True)
        self.change_topics()