Example 1
    def testAddDocNormalCase(self):
        print "SuperDocGeneratorTest: testing SuperDocGenerator.add_doc normal case..."
        test_meta = SupremeCourtOpinionMetadata()
        test_meta.case_num = "No. 99"
        test_doc = Document(test_meta, OPINION_TEXT, TEST_PICKLE_PATH)

        self.assertEqual(len(self.test_generator.doc_list), 5)
        self.test_generator.add_doc(test_doc)
        self.assertEqual(len(self.test_generator.doc_list), 6)
        self.assertEqual(self.test_generator.doc_list[5], test_doc)
Example 2
def _parse_html_file(path_info):
    '''Open the html file with the given path then parse the file.'''
    file, file_dir, url = path_info
    with open(file_dir + file, 'r') as html:
        try:
            document = Document(url, file_dir, html).export()
            return document
        except Exception as ex:
            message = 'Problem parsing file ' + file
            log_unsuccessful('parse')(message=message, exception=ex)
Example 3
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices,
                                           sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices,
                                           sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "File processed to create feature vectors for training."
Example 4
    def new(self, domain, name):
        '''
        Create a new document

        :param domain: Domain for the document
        :param name: Unique name for the document (in the domain)
        :return: Document
        '''
        doc_id = self.__engine.create_new_doc(domain, name)
        doc = Document(self.__engine, doc_id)
        return doc
Example 5
    def getDocument(self, text):
        other = {}
        modeA = False
        modeW = False
        info = ""
        identifier = ""
        author = ""
        texte = ""

        st = text.split("\n")
        s = ""
        for s in st:
            if (s.startswith(".I")):
                identifier = s[3:]
                continue

            if (s.startswith(".")):
                if (modeW):
                    texte = info
                    info = ""
                    modeW = False

                if (modeA):
                    author = info
                    info = ""
                    modeA = False

            if (s.startswith(".W")):
                modeW = True
                info = s[2:]
                continue

            if (s.startswith(".A")):
                modeA = True
                info = s[2:]
                continue

            if ((modeW) or (modeA)):
                #print "add "+s
                info += " " + s

        if (modeW):
            texte = info
            info = ""
            modeW = False

        if (modeA):
            author = info
            info = ""
            modeA = False

        other["text"] = texte[4:]
        doc = Document(identifier, texte[2:], other)
        return doc
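The record format this parser expects is easiest to see from a sample. A minimal sketch, assuming Cranfield/CACM-style markers where `.I` carries the identifier, `.A` starts the author field and `.W` starts the body text (the content below is invented for illustration):

# Hedged illustration; only the field markers are taken from the parser above.
sample = "\n".join([
    ".I 1",                          # identifier -> "1" (s[3:])
    ".A",                            # author field starts
    "J. Doe",
    ".W",                            # body text starts
    "First line of the abstract,",
    "continued on the next line.",   # continuation lines are appended with a space
])
# doc = parser.getDocument(sample)   # `parser` is an instance of the enclosing class (name not shown)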
Example 6
    def parse(self, url, file_type, file_content):
        # Parse the file as an HTML file.
        # Reference from: https://stackoverflow.com/questions
        #   30565404/remove-all-style-scripts-and-html-tags-from-an-html-page
        text = file_content
        title = ''
        if 'html' in file_type:
            # Clean the file. Don't save HTML markup
            soup = BeautifulSoup(file_content, 'html.parser')
            # Remove all javascript and stylesheet code.
            for script in soup(["script", "style"]):
                script.extract()

            title = soup.title.string  # Get the title of this file.
            # print("The title of this file is: ", title)
            text = soup.body.get_text()  # Get the body of this file.

        lines = (line.strip() for line in text.splitlines())
        # Build a chunk of tokens.
        chunks = []
        for line in lines:
            for phrase in line.split(" "):  # Split with space.
                chunks.append(phrase.strip())
        # Drop blank lines.
        text = '\n'.join(chunk for chunk in chunks if chunk)

        # Write to a file.
        self.doc_id += 1
        filename = "Doc#" + str(self.doc_id) + '.txt'
        # Ensure the file will be closed.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text)

        # Only documents that are going to be parsed get an id.
        document = Document(url, self.doc_id, filename, file_type,
                            self.stop_words)
        document.filter()
        document.stem()
        document.collection()
        # print("There're", len(document.term), "terms in document", filename)

        if 'html' in file_type:
            document.set_title(title)

        # Duplicate Detection
        for d in self.docs:
            if self.duplicate_detection(d, document) == 1:
                # print("The content of Doc#{} is exact duplicate with Doc#{}, so, we won't parse Doc#{}."
                #       .format(document.get_id(), d.get_id(), document.get_id()))
                self.url_already_seen = self.url_already_seen.union(
                    {str(document.get_url())})
                return False
        self.docs.append(document)
        return True
Example 7
def spellcheck(file_name):
    print 'spellcheck'
    d = Document(file_name)
    s = SpellcheckAPI()
    tmp = d.get_text()
    if tmp:
        head, tail = os.path.split(file_name)
        out_name = 'samples/output/corrections/%s' % tail
        corrections = s.spellcheck(tmp)  # run the spellcheck once and reuse the result
        print(corrections)
        with open(out_name, 'w') as f:
            f.write(corrections)
Example 8
 def map(self, line):
     # TODO: call `self.emit(key, value)`
     instance = Document(line)
     min_dist = sys.maxsize
     key = -1
     for cluster in self.clusters:
         dist = MathUtil.compute_distance(map1=cluster.tfidf,
                                          map2=instance.tfidf)
         if dist < min_dist:
             key = cluster.uid
             min_dist = dist
     self.emit(key, line)  #instance.__str__()
Example 9
 def map(self, line):
     #find cluster assignment by brute force
     doc = Document(line)
     cluster_uid = None
     sqdist_to_nearest = float('inf')
     for cluster_k in self.clusters:
         sqdist_k = MathUtil.compute_distance(map1 = cluster_k.tfidf, map2 = doc.tfidf, squared=True)
         if sqdist_k <= sqdist_to_nearest:
             sqdist_to_nearest = sqdist_k  # remember the best distance seen so far
             cluster_uid = cluster_k.uid
     #dutifully emit.
     self.emit(key = cluster_uid, value = doc)
     return
Example 10
def read_clustered_corpus(path):
    result = []

    for directory in os.listdir(path):
        cluster = []
        for file in os.listdir(os.path.join(path, directory)):
            text_file = read_text_file(os.path.join(path, directory, file))
            document = Document(text_file)
            cluster.append(document)
        result.append(cluster)

    return result
Example 11
    def get(self, domain, name):
        '''
        Retrieve a document

        :param domain: Name of the domain to get document from
        :param name: Name of the document to retrieve (within domain)
        :return: Document objects
        '''
        doc_id = self.__engine.get_document_id(domain, name)
        if doc_id is None:
            raise KeyError("Document doesn't exist: %s \\ %s" % (domain, name))
        return Document(self.__engine, doc_id)
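A minimal usage sketch, assuming `store` is an instance of the same collection class (the one that also defines the `new` method in Example 4); the domain and document names are placeholders:

# Hedged sketch; `store`, "reports" and "2021-q3" are placeholders.
try:
    doc = store.get("reports", "2021-q3")
except KeyError:
    # get() raises KeyError for an unknown (domain, name) pair,
    # so fall back to creating the document instead.
    doc = store.new("reports", "2021-q3")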
Example 12
    def __init_context(self):
        """
            Spidermonkey Context initialization.
        """
        document = Document(self)
        self.__dict__['__cx'] = self.__dict__['__rt'].new_context(alertlist = [])
        self.__dict__['__sl'] = []
        self.__dict__['__fl'] = [document]

        self.__init_properties(document)
        self.__init_methods()
        self.__finalize_context()
Example 13
        def build(self):
                self.builder = gtk.Builder()
                self.builder.add_from_file(os.path.join(self.datadir, 'ui', 'snippets.ui'))
                
                handlers_dic = {
                        'on_dialog_snippets_response': self.on_dialog_snippets_response,
                        'on_dialog_snippets_destroy': self.on_dialog_snippets_destroy,
                        'on_button_new_snippet_clicked': self.on_button_new_snippet_clicked,
                        'on_button_import_snippets_clicked': self.on_button_import_snippets_clicked,
                        'on_button_export_snippets_clicked': self.on_button_export_snippets_clicked,
                        'on_button_remove_snippet_clicked': self.on_button_remove_snippet_clicked,
                        'on_entry_tab_trigger_focus_out': self.on_entry_tab_trigger_focus_out,
                        'on_entry_tab_trigger_changed': self.on_entry_tab_trigger_changed,
                        'on_entry_accelerator_focus_out': self.on_entry_accelerator_focus_out,
                        'on_entry_accelerator_focus_in': self.on_entry_accelerator_focus_in,
                        'on_entry_accelerator_key_press': self.on_entry_accelerator_key_press,
                        'on_source_view_snippet_focus_out': self.on_source_view_snippet_focus_out,
                        'on_tree_view_snippets_row_expanded': self.on_tree_view_snippets_row_expanded,
                        'on_tree_view_snippets_key_press': self.on_tree_view_snippets_key_press}

                self.builder.connect_signals(handlers_dic)
                
                self.build_tree_view()
                self.build_model()

                image = self['image_remove']
                image.set_from_stock(gtk.STOCK_REMOVE, gtk.ICON_SIZE_SMALL_TOOLBAR)

                source_view = self['source_view_snippet']
                manager = get_language_manager()
                lang = manager.get_language('snippets')

                if lang:
                        source_view.get_buffer().set_highlight_syntax(True)
                        source_view.get_buffer().set_language(lang)
                        self.snippets_doc = Document(None, source_view)

                combo = self['combo_drop_targets']
                combo.set_text_column(0)

                entry = combo.child
                entry.connect('focus-out-event', self.on_entry_drop_targets_focus_out)
                entry.connect('drag-data-received', self.on_entry_drop_targets_drag_data_received)
                
                lst = entry.drag_dest_get_target_list()
                lst = gtk.target_list_add_uri_targets(lst, self.TARGET_URI)
                entry.drag_dest_set_target_list(lst)
                
                self.dlg = self['dialog_snippets']
                
                if self.default_size:
                        self.dlg.set_default_size(*self.default_size)
Example 14
    def newDiagram(self, widget, data=None):
        newDocument = Document(self.tabsPanel.get_current_page() + 1)
        scrollArea = gtk.ScrolledWindow()
        scrollArea.set_policy(gtk.POLICY_ALWAYS, gtk.POLICY_ALWAYS)
        scrollArea.add_with_viewport(newDocument)

        n = self.tabsPanel.append_page(
            scrollArea,
            gtk.Label("Diagram %d" % (self.tabsPanel.get_n_pages() + 1)))
        scrollArea.show_all()
        self.tabsPanel.set_current_page(n)

        self.documentManager.documents.append(newDocument)
Example 15
 def __init__(self):
     '''
     On initialization, the class saves a reference to the DBlink class and two Document references: one for
     the info of the last query and one for the alias.
     '''
     self.dblink=DBlink()
     # the lastquery document must be in the personal path (HOME on Linux and macOS, Documents on Windows)
     self.lastquery=Document("lastquery.txt")
     # the alias document must be in the class's own path
     path=os.path.dirname(os.path.abspath(__file__))
     self.alias=KL_Document("alias.txt",path)
     # default table for the MEDCORDEX user
     self.default_table='MEDCORDEX'
Example 16
 def __init__(self, vectorSize=100, windowSize=5):
     super().__init__()
     self.document = Document()
     if not Path(c.doc2VecModel).exists():
         docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(self.document.docList)]
         print(docs)
         self.model = Doc2Vec(vector_size=vectorSize, window=windowSize, min_count=5, workers=4, epochs=40,
                              alpha=0.025)
         self.model.build_vocab(docs)
         self.model.train(docs, total_examples=self.model.corpus_count, epochs=self.model.epochs)
         self.model.save("./doc2VecModel")
     else:
         self.model = Doc2Vec.load(c.doc2VecModel)
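A minimal sketch of querying the saved model afterwards, using gensim's standard `infer_vector` API; the token list is a made-up placeholder and the load path matches the `save()` call above:

# Hedged sketch; `tokens` stands in for a pre-tokenised query document.
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("./doc2VecModel")      # path used by save() above
tokens = ["example", "query", "document"]
vector = model.infer_vector(tokens)         # dense vector of length vectorSize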
Example 17
def generate_raw_data(input_file, embed_map, gen_type='gold'):
    print('loading data from %s' % input_file)
    data_reader = jsonlines.open(input_file)

    raw_data = []
    for doc_data in data_reader.iter():
        doc = Document(doc_data, embed_map)
        if gen_type == 'gold':
            raw_data += doc.generate_gold_anaphor_data()
        else:
            raw_data += doc.generate_candidate_anaphor_data()

    print("---> total number of training pairs: %s" % len(raw_data))
    return raw_data
Example 18
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Grouping sentences by section and passing them to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    #-----------------------------------------
    sents, sent_indices = getSecRankedSent(doc)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding
    # section index
    sec_indices = sent2Section(doc, sent_indices)
    summary = []
    classified = []
    sum_len = 0
    for sent, sec_idx in zip(sents, sec_indices):
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
            classified.append((sent, sent_val))
    for sent, val in sorted(classified, key=itemgetter(1)):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
Example 19
 def __init__(self, dataDir, wordToIdMap, wordList):
     self.D = 0  # The number of documents
     # self.clusterNoArray = []
     self.documents = []
     with open(dataDir) as input:
         line = input.readline()
         while line:
             self.D += 1
             obj = json.loads(line)
             text = obj['textCleaned']
             document = Document(text, wordToIdMap, wordList, int(obj['tweetId']))
             self.documents.append(document)
             line = input.readline()
     print("number of documents is ", self.D)
Example 20
    def map(self, line):
        # Key is cluster id - clusters stored in self.clusters
        # Value is the line
        dist = float("inf")
        temp_dist = float("inf")
        doc = Document(line)
        key = doc.uid
        for c in self.clusters:
            temp_dist = MathUtil.compute_distance(doc.tfidf,c.tfidf)
            if temp_dist < dist:
                dist = temp_dist
                key = c.uid

        self.emit(str(key),str(doc))
Example 21
File: tests.py Project: kbrady/tess
def four_frames_test():
    # make directories
    original_pic_dir = 'tests/four-frames/original-pictures'
    dir_for_bigger_images = 'tests/four-frames' + os.sep + settings.images_ready_for_ocr
    if not os.path.isdir(dir_for_bigger_images):
        os.mkdir(dir_for_bigger_images)
    dir_for_hocr = 'tests/four-frames' + os.sep + settings.hocr_dir
    if not os.path.isdir(dir_for_hocr):
        os.mkdir(dir_for_hocr)
    dir_for_xml = 'tests/four-frames' + os.sep + settings.xml_dir
    if not os.path.isdir(dir_for_xml):
        os.mkdir(dir_for_xml)
    # make initial run through the images
    for filename in os.listdir(original_pic_dir):
        # resize
        full_path = original_pic_dir + os.sep + filename
        full_path_for_new_image = dir_for_bigger_images + os.sep + filename
        initial_ocr.resize_image(full_path,
                                 full_path_for_new_image,
                                 redo=True,
                                 part='digital reading')
        # run tesseract
        full_path_for_hocr = dir_for_hocr + os.sep + filename
        initial_ocr.run_tesseract_on_image(full_path_for_new_image,
                                           full_path_for_hocr,
                                           redo=True)
    # make corrections
    correct_bags = ocr_cleanup.get_correct_bags()
    word_to_doc = ocr_cleanup.make_matching_dictionary(correct_bags)
    ocr_cleanup.cleanup_hocr_files(dir_for_hocr, dir_for_xml, correct_bags,
                                   word_to_doc)
    # find differences
    for filename in os.listdir(dir_for_xml):
        full_path = dir_for_xml + os.sep + filename
        doc = Document(full_path)
        lines = [str(l).strip() for l in doc.lines if len(str(l).strip()) > 0]
        filename_with_txt_ending = filename[:-len('png.hocr')] + 'txt'
        path_to_correct_lines_file = 'tests/four-frames' + os.sep + 'limited-correct-output-text' + os.sep + filename_with_txt_ending
        with open(path_to_correct_lines_file, 'r') as infile:
            correct_lines = [line.strip() for line in infile]
        if len(lines) != len(correct_lines):
            raise Exception(
                'lines has length {0} but correct_lines has length {1} for {2}'
                .format(len(lines), len(correct_lines), filename))
        for i in range(len(lines)):
            if lines[i] != correct_lines[i]:
                raise Exception(
                    'lines[{0}] has value\n{1}\n but correct_lines[{0}] has value\n{2}\n for {3}'
                    .format(i, lines[i], correct_lines[i], filename))
    print('Four frames test passed')
Example 22
def get_test_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 7
    x = 0
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        #if not validSentence(doc[idx]):
        #    continue
        #else:
        #    sent_idx.append(idx)
        #    samples += doc[idx].sentence.encode('utf-8') + '\n'
        #    num -= 1
        sent_idx.append(idx)
        samples += doc[idx].sentence.encode('utf-8') + '\n'
        num -= 1
        #---------------------------------------------------
        # Storing the sentence in the dictionary for pickling for display
        infi = re.match(r'/home/ankur/devbench/scientific/scisumm/demo/(.+)-parscit-section\.xml', infile).group(1)
        key = infi + "-" + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------" + str(doc) + "---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx, sent_idx
Example 23
 def __init__(self):
     super().__init__()
     self.document = Document()
     self.nodes = self.document.nodeList
     self.docs = [item.lower().split() for item in self.document.rawDocList]
     if Path('word2vecTrained.model').exists():
         self.word2Vec = Word2Vec.load('word2vecTrained.model')
     else:
         self.word2Vec = Word2Vec(self.docs,
                                  size=100,
                                  window=4,
                                  min_count=1,
                                  workers=3)
         self.word2Vec.save('word2vecTrained.model')
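A minimal sketch of querying the saved model, using gensim's standard `wv.most_similar` API; the query word is an assumption and must occur in the training vocabulary:

# Hedged sketch; 'document' is an assumed in-vocabulary query term.
from gensim.models import Word2Vec

model = Word2Vec.load('word2vecTrained.model')
print(model.wv.most_similar('document', topn=5))   # nearest neighbours by cosine similarity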
Example 24
def load_data(src_file, tgt_file):
    docs = []
    with open(src_file, 'r', encoding='utf-8') as src_reader, \
            open(tgt_file, 'r', encoding='utf-8') as tgt_reader:
        for src_line, tgt_line in zip(src_reader, tgt_reader):
            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            if src_line == "" or tgt_line == "":
                docs.append(None)
                continue
            src_sents = src_line.split('##SENT##')
            tgt_sents = tgt_line.strip().split('##SENT##')
            docs.append(Document(src_sents, tgt_sents))
    return docs
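The expected layout is one document per line in each file, with sentences separated by the literal `##SENT##` marker and the source and target files aligned line by line. A minimal sketch with invented content:

# Hedged illustration only; file names and sentences are invented.
with open('src.txt', 'w', encoding='utf-8') as f:
    f.write("First source sentence .##SENT##Second source sentence .\n")
with open('tgt.txt', 'w', encoding='utf-8') as f:
    f.write("First target sentence .##SENT##Second target sentence .\n")

docs = load_data('src.txt', 'tgt.txt')
# docs[0] wraps two source and two target sentences; a blank line in either
# file would instead leave a None placeholder at that position.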
Example 25
    def map(self, line):
        # TODO: Your code goes here -- call `self.emit(key, value)`

        doc = Document(line)
        shortest = float('inf')
        # current cluster label of this data point
        cur_center = 999
        for cluster in self.clusters:
            dist_temp = self.l2_norm(doc.tfidf, cluster.tfidf)

            if dist_temp < shortest:
                shortest = dist_temp
                cur_center = cluster.uid

        self.emit(str(cur_center), str(doc))
Example 26
def predict(test_data):
    with open('predictions.csv', 'w', encoding='utf-8') as output:
        writer = csv.writer(output)
        writer.writerow([
            'document', 'predict_class', 'predict_score', 'exp_predict_score'
        ])
        for instance in test_data.iterrows():
            doctext = instance[1]['document']
            doc = Document(doctext)
            predict_clas = max(stats.classes,
                               key=lambda c: _compute_score(doc, c))
            predict_score = _compute_score(doc, predict_clas)
            exp_predict_score = np.exp(predict_score)
            writer.writerow(
                [doctext, predict_clas, predict_score, exp_predict_score])
Example 27
def process_twitter_folder(folder, metadata):
    textfiles = [
        join(folder, f) for f in listdir(folder)
        if isfile(join(folder, f)) and f.endswith(".txt")
    ]
    #textfiles = textfiles[:2] #limit for quick processing if you wish, but should be ok to work with all.
    documents = []
    for tf in textfiles:
        textname = splitext(
            split(tf)[1])[0]  #extract just username from filename.
        print('Processing ' + textname)
        document = Document(textname, metadata)
        document.process_document_from_textfile(tf)
        documents.append(document)
    return documents
Example 28
 def __init__(self, manualFilepath=config.manualPath, k=60):
     documents = Document(manualFilepath)
     docs = documents.docList
     self.Doc = documents
     self.docTree = documents.tree
     self.nodes = documents.nodeList
     self.uniqueTerms = getAllUniqueTerms(docs)
     self.allTopics = documents.allTopics
     termDocMatrix = getTermDocMatrix(self.uniqueTerms, docs)
     u, s, vh = np.linalg.svd(termDocMatrix, full_matrices=False)
     S = np.diag(s)
     # replace with the variable k
     self.uk = u[:, :k]
     self.Sk = S[:k, :k]
     self.vhk = vh[:k, :]
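A minimal sketch of how a query might be folded into the truncated space built above and scored against the document columns of `vhk`; `lsa` denotes an instance of the class, and how `query_vec` is built as a term-count vector over `lsa.uniqueTerms` is an assumption:

# Hedged sketch of standard LSI query folding; the construction of `query_vec`
# is not shown in the original code.
import numpy as np

def fold_query(lsa, query_vec):
    # q_k = Sk^{-1} . uk^T . q : project the query into the k-dimensional concept space
    q_k = np.linalg.inv(lsa.Sk) @ lsa.uk.T @ query_vec
    docs_k = lsa.vhk                              # shape (k, n_docs): one column per document
    # cosine similarity between the folded query and every document column
    sims = (docs_k.T @ q_k) / (np.linalg.norm(docs_k, axis=0) * np.linalg.norm(q_k) + 1e-12)
    return sims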
Example 29
def load_data(path=""):
    time.sleep(0.5)
    print("start to load data from path----->", path)
    time.sleep(0.5)
    file_list = os.listdir(path)
    sentences = list()

    for filename in file_list:
        current_path = os.path.join(path, filename)
        document = Document(filename=current_path)
        for sentence in document.sentence_list:
            sentences.append(sentence)

    return sentences
Example 30
    def search(self):
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tDocument Retrieval", self.id_q)

        search_engines = self._get_search_engines()

        try:
            num = int(MyConfig.get("document_retrieval", "n_results"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            num = 10

        results = []
        for engine in search_engines:
            try:
                results += engine.search(self.query, count=num)
            except Exception as e:
                logger = logging.getLogger("qa_logger")
                logger.error("Problem with search engine.")
                logger.debug(e)
                sys.exit(1)

        doc_list = []
        # rank loops over [0..num-1]
        rank = 0
        # ignore repeated urls
        unique_urls = set()
        for resource in results:
            if resource.url in unique_urls:
                continue
            unique_urls.add(resource.url)

            # rank+1 loops over [1..num]
            # rank+1 is the relative position of the results
            doc_list.append(Document(resource, rank + 1))
            rank = (rank + 1) % num

        try:
            if MyConfig.get("persistence", "document") == "True":
                output = open("documentos.pkl", "wb")
                pickle.dump(doc_list, output, 0)
                output.close()
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))

        return doc_list