Example 1
def getUnwanted(outfile):
    dir = DIR['BASE'] + "demo/"
    os.chdir(dir)
    samples = ''
    total = 0
    num = 12
    for file in glob("*.xml"):
        try:
            doc = Document(file)
            sentences, offset = doc.all_sentences()
            # Ranker
            ranker = TextRank(sentences)
            ranker.rank()
            scores = sorted(ranker.scores, key=itemgetter(1))
            for x in range(num):
                idx = scores[x][0] + offset
                samples += doc[idx].sentence.encode('utf-8') + '\n'
            total += 1
            print(file + " : Done")
        except Exception as e:
            print(file + str(e))
    # for now this is the location of the file
    writeToFile(outfile, samples, 'w')
    print ("Total number of files processed successfully : " + str(total))
Example 2
def summarize_secitons(document, sections, coef=0.8):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        limit = len(sec_sentences)

        # Ranker
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=limit, coef=coef)
        sentencs = ranker.scores

        summary = []
        for x in range(num):
            idx = sentencs[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, sentencs[x][1], doc.get_section_name(idx)))
            summ.append(sent)
        text = ""
        logit("\nSection : " + section_name)
        for sent, score, section in summary:
            text += "\n" + sent.encode("utf-8")
        logit(text)
    file = DIR["BASE"] + "data/Summary.txt"
    with open(file, "w") as sfile:
        sfile.write("\n".join(summ).encode("utf-8"))
Example 3
def get_pos_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    #sentences, o = doc.all_sentences()
    #ranker = Ranker(sentences, tfidf=False)
    #-----------------------------------------
    # Instead of the above, now sentences will be clubbed into sections and
    # passed to the ranker, which is to be returned
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    sent, offset = doc.section_sentences('abstract')
    sent_idx = range(offset, offset + len(sent))
    samples = '\n'.join(sent)
    writeToFile(outfile, samples, 'w')
    #return ranker, sent_idx
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Positive---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
Example 4
 def run(self,q=Query()):
     '''
     dispatch all arguments into log.txt
     '''
     log=Document("log.txt")
     story=log.get_params()
     for i in story:
         self.dispatch(story[i]+"\n")
Example 5
 def edit( self, text, description=None, text_format=None ):
     """Edit the News Item.
     """
     if text_format is None:
         text_format = getattr(self, 'text_format', 'structured-text')
     if description is not None:
         self.setDescription( description )
     Document.edit( self, text_format, text )
Example 6
 def _edit( self, text, description=None, text_format=None ):
     """
         Edit the News Item
     """
     if text_format is None:
         text_format = getattr(self, 'text_format', 'html')
     if description is not None:
         self.setDescription( description )
     Document._edit( self, text_format, text )
Example 7
    def __init__(self, doc, win = None, referer = None, lastModified = None, cookie = ''):
        Document.__init__(self, doc)

        self._win           = win
        self._referer       = referer
        self._lastModified  = lastModified
        self._cookie        = cookie
        self._html          = None
        self.current        = None
Example 8
    def edit(self, text_format, text, file='', REQUEST=None):
        """
        Edit the discussion item.
        """

        Document.edit(self, text_format, text, file)
        if REQUEST is not None:
            return self.editForm(self, REQUEST, portal_status_message= \
                                 'Discussion item changed.')
Example 9
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices,
                                           sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices,
                                           sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "All input files processed to create feature vectors for training."
Example 10
    def __init__(self, doc, win = None, referer = None, lastModified = None, cookie = ''):
        Document.__init__(self, doc)

        self._win           = win
        self._referer       = referer
        self._lastModified  = lastModified
        self._cookie        = cookie
        self._html          = None
        self._domain        = urlparse(self._win.url).hostname if self._win else ''
        self.current        = None
Example 11
class PubmedArticleSet(handler.ContentHandler):
    def __init__(self):
        handler.feature_external_ges = "false"
        self.docs = {}
        self.doc = None
        self.chars = ""
        
               
    def startElement(self, name, attr):
        if name == 'PubmedArticle' or name == 'PubmedBookArticle':
            self.doc = Document()            
        self.chars = ""

            
    def endElement(self, name):
        if name == 'PubmedArticle':
            self.docs[self.doc.pmid] = self.doc
        if name == 'PMID' and self.doc.pmid == None:
            self.doc.pmid = self.text()
        if name == 'ArticleTitle':
            self.doc.title = self.text()
        if name == 'AbstractText':
            if self.doc.abstract == None:
                self.doc.abstract = self.text()
            else:
                self.doc.abstract += self.text()
        if name == 'DescriptorName':
            self.doc.addMeSH(self.text())
            
    
    def characters(self, data):
        self.chars += data


    def text(self):
        return self.chars.strip().encode('ascii', 'ignore')
    

    ## Method to parse a PubmedArticleSet XML file.
    # @param location The location of the xml file to parse
    # return A PubmedArticleSet object 
    @classmethod
    def parse(self, location):
        parser = make_parser()
        parser.setFeature("http://xml.org/sax/features/external-general-entities", False)
        parser.setFeature("http://xml.org/sax/features/external-parameter-entities", False)
        handler = PubmedArticleSet()
        parser.setContentHandler(handler)
        try:
            f = open(location, 'r')
            parser.parse(f)
            f.close()
        except Exception, e:
            raise RuntimeError, "Could not parse PubmedArticleSet XML file at %s" % location
        return handler
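A short usage sketch for the handler above; parse() returns the handler itself, so the parsed articles are available in its docs dict keyed by PMID (the file name is a made-up example):

handler = PubmedArticleSet.parse("pubmed_result.xml")
for pmid, doc in handler.docs.items():
    print pmid, doc.title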
Example 12
def test2():
    cnf = QConfiguration()

    cnf.get_configuration('.','exam.cfg')

    print cnf._title
    print cnf._course
    print cnf._date

    doc = Document(cnf)

    doc.save('/home/javier/tmp', 'test')
Example 13
    def __init__(self, doc, win=None, referer=None, lastModified=None, cookie=""):
        Document.__init__(self, doc)

        self._win = win
        self._referer = referer
        self._lastModified = lastModified
        self._cookie = cookie
        self._html = None
        self._readyState = "loading"
        self._domain = urlparse(self._win.url).hostname if self._win else ""
        self.current = None
        self.__init_personality()
Example 14
def get_test_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 7
    x = 0
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        #if not validSentence(doc[idx]):
        #    continue
        #else:
        #    sent_idx.append(idx)
        #    samples += doc[idx].sentence.encode('utf-8') + '\n'
        #    num -= 1
        sent_idx.append(idx)
        samples += doc[idx].sentence.encode('utf-8') + '\n'
        num -= 1
        #---------------------------------------------------
        # Storing the sentence in the dictionary for pickling for display
        infi = re.match(r'/home/ankur/devbench/scientific/scisumm/demo/(.+)-parscit-section\.xml', infile).group(1)
        key = infi + "-" + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------" + str(doc) + "---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx, sent_idx
Example 15
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    #-----------------------------------------
    sents, sent_indices = getSecRankedSent(doc)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding
    # section index
    sec_indices = sent2Section(doc, sent_indices)
    summary = []
    classified = []
    sum_len = 0
    for sent, sec_idx in zip(sents, sec_indices):
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
            classified.append((sent, sent_val))
    for sent, val in sorted(classified, key=itemgetter(1)):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
Example 16
    def createDocumentWithAbstract(self, document_id, document_year, document_title, document_author, currentFile,  bucketId):
        # Create a document
        document_path = self.root_path + "/nips" + bucketId
        document = Document(document_id, document_year, document_title, document_author, currentFile, document_path)

        # Create an Abstract
        abstract_text = self.getAbstract(document)
        abstract = Abstract(document.document_id, document.year, abstract_text)

        # Link a Document to its abstract
        document.abstract = abstract

        # Store the Document and the abstract in memory.
        self.documents.append(document)
        self.abstracts.append(abstract)
Example 17
 def search(self, query):
     results = []
     query_document = Document(query)
     query_stems = query_document.get_stems()
     documents = self.corpus.get_documents()
     for doc in documents:
         document_id = doc.get_id()
         score = 0.0
         stemmed_document = doc.get_stems()
         for qstem in query_stems:
             if qstem in stemmed_document:
                 term_frequency = self.metrics.get_term_frequency(document_id, qstem)
                 score += term_frequency
         if score > 0.0:
             results.append({"id": doc.get_id(), "score": score, "text": doc.get_text()})
     return results
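The loop above accumulates raw term frequencies per matching document; if ranked output is wanted, the hits could be sorted by score before returning (a small optional addition, not part of the original method):

results.sort(key=lambda hit: hit["score"], reverse=True)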
Example 18
def getAbstracts(outfile):
    dir = DIR['BASE'] + "demo/"
    os.chdir(dir)
    samples = ''
    total = 0
    for file in glob("*.xml"):
        try:
            doc = Document(file)
            sent, offset = doc.section_sentences('abstract')
            samples += '\n'.join(sent)
            print(file + " : Done")
            total += 1
        except Exception as e:
            print(file + str(e))
    # for now this is the location of the file
    writeToFile(outfile, samples, 'w')
    print ("Total number of files processed successfully : " + str(total))
Example 19
    def __init__(
        self, keyword, value = None, equation = '', units = None, label = None, display = None):
        Document.__init__(self)

        self.keyword      = keyword
        self.value        = value
        self.equation     = equation
        self.units        = units
        self.label        = label
        self.display      = display

        self.variables = {}
        for keyword in EquationParser()(self.equation):
            self.variables[keyword] = None

        if self.isJustValue():
            self.set(eval(self.equation))
Example 20
def summarize(document, all=True):
    doc = Document(document)
    sentences, offset = (doc.all_sentences() if all
                         else doc.filtered_sentences())

    # Ranker
    ranker = TextRank(sentences)
    ranker.rank()
    scores = ranker.scores

    # Selector
    summary = []
    sum_len = 0
    for x in range(num):
        idx = scores[x][0] + offset
        sent = doc[idx].sentence
        if sum_len + len(sent.split(' ')) > MAXLEN:
            break
        summary.append((sent, scores[x][1], doc.get_section_name(idx)))
        sum_len += len(sent.split(' '))
    text = ''
    logit("\nP10-1024")
    logit("\nAll Sentences" if all else "\nFiltered Sentences")
    logit("Length of summary : " + str(sum_len))
    for sent, score, section in summary:
        text += '\n' + "[" + section.encode('utf-8') + "] " + \
                sent.encode('utf-8')
                #"[" + str(score) + "] " + sent.encode('utf-8')
    logit(text)

    # Printer
    # this has to be automated
    file = DIR['BASE'] + "data/Summary.txt"
    with open(file, 'w') as sfile:
        sfile.write('\n'.join([sent for sent, sc, sec in summary]).
                    encode('utf-8'))

    # Evaluator
    guess_summary_list = [file]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
Example 21
 def __init__(self):
     '''
     Initialise a Document object with the configuration data and set all default connection info
     '''
     self.config=Document("config.txt")
     self.default_host='www.medcordex.eu'
     self.default_user='******'
     self.default_passwd='sagitta'
     self.default_db='medcordex'
Example 22
def generateTestFeatures(infile):
    doc = Document(infile)
    #------------------------------------------------
    # For display and analysis
    dir, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    all_sentences, all_offset = doc.all_sentences()
    ranker = TextRank(all_sentences)
    ranker.rank()
    num = 7
    x = 0
    test_sents = []
    sent_indices = []
    while num > 0:
        idx = ranker.scores[x][0] + all_offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            test_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
        #------------------------------------------------
        # For display and analysis
        key = fcode + '-' + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    #-----------------------------------------
    for sentence, sent_idx in zip(test_sents, sent_indices):
        key = fcode + '-' + str(sent_idx)
        print key
        print test_data[key]['contextpre']
        print "----Main sentence Start----"
        print test_data[key]['sentence']
        print "----Main sentence End----"
        print test_data[key]['contextpos']
        feature_string = raw_input()
        feature_string += '1'
        test_data[key]['reallbl'] = feature_string
Example 23
 def show_descriptions(self):
     '''
     Check the language option and an operation name in the arguments of the current class.
     Open the file in Manual for the selected language and look up the operation name.
     Dispatch the description of the operation.
     '''
     try:
         language=self.getLanguage(self.args)+".txt"
         path=os.path.dirname(os.path.abspath(__file__))+"/Manual/"
         #check if the language file exists in the path
         if os.path.exists(os.path.join(path,language)): doc=Document(language,path)
         #otherwise raise exception
         else : raise Exception("No language founded with name : "+language)
         for a in self.args:
             description=str(doc.get_parameter(a))
             if description!="": self.dispatch(description)
             else: raise Exception("Wrong operation or No describe implementation for this operation")
     except Exception as e:
         self.dispatch(e)
Example 24
def main():
    clustered_corpus_path = 'clustered_corpus'
    clustered_corpus = read_clustered_corpus(clustered_corpus_path)
    corpus = merge_clustered_corpus_into_a_single_corpus(clustered_corpus)

    target_file_path = 'target.txt'
    text = read_text_file(target_file_path)
    document = Document(text)

    corpus = Corpus(corpus)
    clustered_corpus = ClusteredCorpus(clustered_corpus)

    candidate_to_rank_mapping = {}
    candidate_to_params_mapping = {}
    candidate_to_dfs_in_each_cluster_mapping = {}

    for candidate in document.get_candidates():
        tf = math.log(1.0 + document.get_tf_for(candidate), 10.0)
        # tf = document.get_tf_for(candidate)
        idf = math.log(1.0 + 1.0 / corpus.get_df_for(candidate), 2.0)
        cu = clustered_corpus.get_cu_for(candidate)

        rank = cu
        # rank = tf * cu
        # rank = tf * idf

        dfs_in_each_cluster = clustered_corpus.get_dfs_in_each_cluster_for(candidate)

        candidate_representative = corpus.get_representative_for(candidate)
        candidate_to_rank_mapping[candidate_representative] = rank
        candidate_to_params_mapping[candidate_representative] = (tf, idf, cu)
        candidate_to_dfs_in_each_cluster_mapping[candidate_representative] = dfs_in_each_cluster

    table = generate_table_based_on(
        candidate_to_rank_mapping,
        candidate_to_params_mapping,
        candidate_to_dfs_in_each_cluster_mapping
    )

    save_as_file(table)
    print('Done.')
Example 25
def read_clustered_corpus(path):
    result = []

    for directory in os.listdir(path):
        cluster = []
        for file in os.listdir(os.path.join(path, directory)):
            text_file = read_text_file(os.path.join(path, directory, file))
            document = Document(text_file)
            cluster.append(document)
        result.append(cluster)

    return result
Example 26
    def __init_context(self):
        """
            Spidermonkey Context initialization.
        """
        document = Document(self)
        self.__dict__['__cx'] = self.__dict__['__rt'].new_context(alertlist = [])
        self.__dict__['__sl'] = []
        self.__dict__['__fl'] = [document]

        self.__init_properties(document)
        self.__init_methods()
        self.__finalize_context()
Example 27
 def map(self, line):
     #find cluster assignment by brute force
     doc = Document(line)
     cluster_uid = None
     sqdist_to_nearest = float('inf')
     for cluster_k in self.clusters:
         sqdist_k = MathUtil.compute_distance(map1 = cluster_k.tfidf, map2 = doc.tfidf, squared=True)
         if sqdist_k <= sqdist_to_nearest:
             # keep the running minimum so the nearest cluster wins
             sqdist_to_nearest = sqdist_k
             cluster_uid = cluster_k.uid
     #dutifully emit.
     self.emit(key = cluster_uid, value = doc)
     return
Example 28
 def _get_arxiv_publications_as_documents(self, n, keyword="data"):
     # fetch n publications from arxiv.org
     url = 'http://export.arxiv.org/api/query?search_query=all:' + keyword + '&start=0&max_results=' + str(
         n)
     data = xmltodict.parse(urllib.request.urlopen(url).read())
     pubs = []
     # if only one document is requested, the response format differs
     if n == 1:
         data["feed"]["entry"] = [data["feed"]["entry"]]
     for pub in data["feed"]["entry"]:
         pubs.append(Document.factory("Arxiv", pub))
     return pubs
Example 29
    def Initialize():
        #Creating the trie object and passing to both the functions
        T = Trie()

        #Initialising the trie with the stopwords
        T = StopWord.initialize(T)

        #Initialising the data by scraping and then loading it into the trie data structure
        T = Document.initializeData(T)

        #Fully loaded Trie data structure is returned
        return T
Example 30
    def get(self, domain, name):
        '''
        Retrieve a document

        :param domain: Name of the domain to get document from
        :param name: Name of the document to retrieve (within domain)
        :return: Document objects
        '''
        doc_id = self.__engine.get_document_id(domain, name)
        if doc_id is None:
            raise KeyError("Document doesn't exist: %s \ %s" % (domain, name))
        return Document(self.__engine, doc_id)
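A hedged usage sketch, assuming store is an object exposing the get() above and that the domain/name pair may be missing:

try:
    doc = store.get("articles", "draft-01")   # domain and name are made-up examples
except KeyError:
    doc = None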
Example 31
 def map(self, line):
     # TODO: call `self.emit(key, value)`
     instance = Document(line)
     min_dist = sys.maxsize
     key = -1
     for cluster in self.clusters:
         dist = MathUtil.compute_distance(map1=cluster.tfidf,
                                          map2=instance.tfidf)
         if dist < min_dist:
             key = cluster.uid
             min_dist = dist
     self.emit(key, line)  #instance.__str__()
Example 32
    def TermFrequency(word):
        TF = {}
        wordDic = {}
        docData = Document.docData
        k = Document.Search(word)
        for key, val in k.items():
            wordDic[key] = len(val)
            TF[key] = len(val) / docData[key]
            if len(val) > 0:
                Ranking.docDic = Ranking.docDic + 1

        return TF
Example 33
def main():

    #read in training documents and documents to classify
    documentList = createDocuments(sys.argv[1])
    trainDocs = documentList[0]
    sampleDocs = documentList[1]

    #read in stopwords
    with open("stopwords.txt", "r") as f:
        stopwords = Document.tokenize(f.read())

    #classify the documents with missing authors
    attributor = Attributor(trainDocs, sampleDocs, stopwords)
    attributor.train()
    attributor.classify()

    writeup = Writeup()
    results = attributor.get_results()
    writeup.print_accuracy(sampleDocs, results)

    writeup.print_confusion_matrix(sampleDocs, results)

    print
    featureRankings = attributor.get_feature_ranking()
    for i in range(0, 20):
        print featureRankings[i][0], featureRankings[i][1]

    print
    featureFrequencies = attributor.get_feature_frequencies()
    featurePlotDataX = []
    featurePlotDataY = []
    for numFeatures in range(10, len(featureFrequencies) + 1, 10):
        newStopwords = [
            featureFrequencies[i][0] for i in range(0, numFeatures)
        ]
        newAttributor = Attributor(trainDocs, sampleDocs, newStopwords)
        newAttributor.train()
        newAttributor.classify()
        newResults = newAttributor.get_results()
        accuracy = writeup.get_accuracy(sampleDocs, newResults)
        featurePlotDataX.append(numFeatures)
        featurePlotDataY.append(accuracy)

    print "Feature curve:"
    for i in range(len(featurePlotDataX)):
        print featurePlotDataX[i], featurePlotDataY[i]

    plt.plot(featurePlotDataX, featurePlotDataY)
    plt.xlabel("Number of Features")
    plt.ylabel("Accuracy")
    plt.title("Accuracy vs. Number of Features")
    plt.axis([0, 450, -0.1, 1.1])
    plt.show()
Example 34
def save_doc():
	global reading_index
	if request.method == 'POST':
		# get time spent editing in seconds
		time_spent_editting = time.time() - edit_start_time

		if len(request.form['changes_dict']) == 0:
			changes = {}
		else:
			changes = eval(request.form['changes_dict'])
		button = request.form['button']

		print('changes', changes)

		# open document
		filetime = reading_times[reading_index]
		# open the already edited document if it exists
		to_save_to = editor_folder + os.sep + time_to_filename(filetime, extension='hocr')
		print('saving_to', to_save_to)
		if os.path.isfile(to_save_to):
			filepath = to_save_to
		else:
			filepath = sess.dir_name + os.sep + source_dirs[source_dir_index] + os.sep + time_to_filename(filetime, extension='hocr')
		doc = Document(filepath, output_dir=editor_folder)
		if 'seconds_spent_editting' in doc.attrs:
			time_spent_editting += eval(doc.attrs['seconds_spent_editting'])
		doc.attrs['seconds_spent_editting'] = time_spent_editting
		# record the path of the file that was edited
		if not os.path.isfile(to_save_to):
			doc.attrs['editted_from_path'] = filepath

		# get the words with the same order and filters as the page
		img_path = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(filetime, extension='jpg')
		img = mpimg.imread(img_path)
		all_words = word_list(doc, img.shape)

		# make changes
		for id_key in changes:
			index = int(id_key)
			all_words[index].text = changes[id_key][0]
			all_words[index].attrs['highlight'] = changes[id_key][1]
			all_words[index].attrs['editted_by_human'] = 'True'
		# save changes
		doc.save()

		#iterate index
		if button == 'Next':
			if reading_index + 1 < len(reading_times):
				reading_index += 1
				return redirect('/doc?reading_index={}'.format(reading_index))
			else:
				return redirect('/')
		else:
			if reading_index > 0:
				reading_index -= 1
				return redirect('/doc?reading_index={}'.format(reading_index))
			else:
				return redirect('/')
	# if no data was sent, go home
	return redirect('/')
Example 35
        def build(self):
                self.builder = gtk.Builder()
                self.builder.add_from_file(os.path.join(self.datadir, 'ui', 'snippets.ui'))
                
                handlers_dic = {
                        'on_dialog_snippets_response': self.on_dialog_snippets_response,
                        'on_dialog_snippets_destroy': self.on_dialog_snippets_destroy,
                        'on_button_new_snippet_clicked': self.on_button_new_snippet_clicked,
                        'on_button_import_snippets_clicked': self.on_button_import_snippets_clicked,
                        'on_button_export_snippets_clicked': self.on_button_export_snippets_clicked,
                        'on_button_remove_snippet_clicked': self.on_button_remove_snippet_clicked,
                        'on_entry_tab_trigger_focus_out': self.on_entry_tab_trigger_focus_out,
                        'on_entry_tab_trigger_changed': self.on_entry_tab_trigger_changed,
                        'on_entry_accelerator_focus_out': self.on_entry_accelerator_focus_out,
                        'on_entry_accelerator_focus_in': self.on_entry_accelerator_focus_in,
                        'on_entry_accelerator_key_press': self.on_entry_accelerator_key_press,
                        'on_source_view_snippet_focus_out': self.on_source_view_snippet_focus_out,
                        'on_tree_view_snippets_row_expanded': self.on_tree_view_snippets_row_expanded,
                        'on_tree_view_snippets_key_press': self.on_tree_view_snippets_key_press}

                self.builder.connect_signals(handlers_dic)
                
                self.build_tree_view()
                self.build_model()

                image = self['image_remove']
                image.set_from_stock(gtk.STOCK_REMOVE, gtk.ICON_SIZE_SMALL_TOOLBAR)

                source_view = self['source_view_snippet']
                manager = get_language_manager()
                lang = manager.get_language('snippets')

                if lang:
                        source_view.get_buffer().set_highlight_syntax(True)
                        source_view.get_buffer().set_language(lang)
                        self.snippets_doc = Document(None, source_view)

                combo = self['combo_drop_targets']
                combo.set_text_column(0)

                entry = combo.child
                entry.connect('focus-out-event', self.on_entry_drop_targets_focus_out)
                entry.connect('drag-data-received', self.on_entry_drop_targets_drag_data_received)
                
                lst = entry.drag_dest_get_target_list()
                lst = gtk.target_list_add_uri_targets(entry.drag_dest_get_target_list(), self.TARGET_URI)
                entry.drag_dest_set_target_list(lst)
                
                self.dlg = self['dialog_snippets']
                
                if self.default_size:
                        self.dlg.set_default_size(*self.default_size)
Example 36
 def __init__(self):
     '''
     On initialization, the class saves a reference to the DBlink class and two references to Document objects,
     one holding the info of the last query and one holding the aliases.
     '''
     self.dblink=DBlink()
     #the lastquery document must be in the personal path (HOME on Linux and macOS, Documents on Windows)
     self.lastquery=Document("lastquery.txt")
     #the alias document must be in the class's path
     path=os.path.dirname(os.path.abspath(__file__))
     self.alias=KL_Document("alias.txt",path)
     #default table for the medcordex user
     self.default_table='MEDCORDEX'
Example 37
 def __init__(self, vectorSize=100, windowSize=5):
     super()
     self.document = Document()
     if not Path(c.doc2VecModel).exists():
         docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(self.document.docList)]
         print(docs)
         self.model = Doc2Vec(vector_size=vectorSize, window=windowSize, min_count=5, workers=4, epochs=40,
                              alpha=0.025)
         self.model.build_vocab(docs)
         self.model.train(docs, total_examples=self.model.corpus_count, epochs=self.model.epochs)
         self.model.save("./doc2VecModel")
     else:
         self.model = Doc2Vec.load(c.doc2VecModel)
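Once built or loaded, the Doc2Vec model can embed unseen text via gensim's infer_vector; a minimal sketch, assuming gensim >= 4.0 and that model refers to the instance created above:

tokens = "a short query about document ranking".split()
vector = model.infer_vector(tokens)
similar = model.dv.most_similar([vector], topn=5)   # nearest tagged training documents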
Example 38
    def newDiagram(self, widget, data=None):
        newDocument = Document(self.tabsPanel.get_current_page() + 1)
        scrollArea = gtk.ScrolledWindow()
        scrollArea.set_policy(gtk.POLICY_ALWAYS, gtk.POLICY_ALWAYS)
        scrollArea.add_with_viewport(newDocument)

        n = self.tabsPanel.append_page(
            scrollArea,
            gtk.Label("Diagram %d" % (self.tabsPanel.get_n_pages() + 1)))
        scrollArea.show_all()
        self.tabsPanel.set_current_page(n)

        self.documentManager.documents.append(newDocument)
Example 39
def get_neg_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 5
    x = -1
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_idx.append(idx)
            samples += doc[idx].sentence.encode('utf-8') + '\n'
            num -= 1
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Negative---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
Example 41
def main():
    filename = input('Please input a filename:    ')
    fileA = Document(filename)
    title = fileA.generateWhole()
    wordlist = fileA.wordlist
    o = 'The time required to do top 50 using dictionary: \n'

    worddict = BasicStats.createFreqMap(wordlist)
    n = 50
    a = time.time()
    topdict = BasicStats.topN(worddict, int(n))
    b = time.time()
    o += str(b - a) + '\n'

    o += 'The time required to do top 50 using heap: \n'
    c = time.time()

    k = BasicStats.HTopNBottomN(worddict, int(n))
    d = time.time()
    o += str(d - c) + '\n'

    o += '\nMax 50\n'
    for i in range(1, 51):
        o += str(k[1][i]) + ' ' + str(k[0][i]) + '\n'

    o += '\nMin 50\n'
    for i in range(1, 51):
        o += str(k[3][i]) + ' ' + str(k[2][i]) + '\n'

    lista = [[], []]
    for i in topdict:
        lista[0] += [i]  #words
        lista[1] += [topdict[i]]  #frequency
    graph = CommandLinePlotter.Scatter2D(lista[1])
    timefile = open('Top50TIMEFILE' + '-' + filename, 'wt', encoding='UTF-8')
    for j in o:
        timefile.write(j)
    timefile.close()
Example 42
 def __init__(self, dataDir, wordToIdMap, wordList):
     self.D = 0  # The number of documents
     # self.clusterNoArray = []
     self.documents = []
     with open(dataDir) as input:
         line = input.readline()
         while line:
             self.D += 1
             obj = json.loads(line)
             text = obj['textCleaned']
             document = Document(text, wordToIdMap, wordList, int(obj['tweetId']))
             self.documents.append(document)
             line = input.readline()
     print("number of documents is ", self.D)
Example 43
    def map(self, line):
        # Key is cluster id - clusters stored in self.clusters
        # Value is the line
        dist = float("inf")
        temp_dist = float("inf")
        doc = Document(line)
        key = doc.uid
        for c in self.clusters:
            temp_dist = MathUtil.compute_distance(doc.tfidf,c.tfidf)
            if temp_dist < dist:
                dist = temp_dist
                key = c.uid

        self.emit(str(key),str(doc))
Example 44
File: tests.py Project: kbrady/tess
def four_frames_test():
    # make directories
    original_pic_dir = 'tests/four-frames/original-pictures'
    dir_for_bigger_images = 'tests/four-frames' + os.sep + settings.images_ready_for_ocr
    if not os.path.isdir(dir_for_bigger_images):
        os.mkdir(dir_for_bigger_images)
    dir_for_hocr = 'tests/four-frames' + os.sep + settings.hocr_dir
    if not os.path.isdir(dir_for_hocr):
        os.mkdir(dir_for_hocr)
    dir_for_xml = 'tests/four-frames' + os.sep + settings.xml_dir
    if not os.path.isdir(dir_for_xml):
        os.mkdir(dir_for_xml)
    # make initial run through the images
    for filename in os.listdir(original_pic_dir):
        # resize
        full_path = original_pic_dir + os.sep + filename
        full_path_for_new_image = dir_for_bigger_images + os.sep + filename
        initial_ocr.resize_image(full_path,
                                 full_path_for_new_image,
                                 redo=True,
                                 part='digital reading')
        # run tesseract
        full_path_for_hocr = dir_for_hocr + os.sep + filename
        initial_ocr.run_tesseract_on_image(full_path_for_new_image,
                                           full_path_for_hocr,
                                           redo=True)
    # make corrections
    correct_bags = ocr_cleanup.get_correct_bags()
    word_to_doc = ocr_cleanup.make_matching_dictionary(correct_bags)
    ocr_cleanup.cleanup_hocr_files(dir_for_hocr, dir_for_xml, correct_bags,
                                   word_to_doc)
    # find differences
    for filename in os.listdir(dir_for_xml):
        full_path = dir_for_xml + os.sep + filename
        doc = Document(full_path)
        lines = [str(l).strip() for l in doc.lines if len(str(l).strip()) > 0]
        filename_with_txt_ending = filename[:-len('png.hocr')] + 'txt'
        path_to_correct_lines_file = 'tests/four-frames' + os.sep + 'limited-correct-output-text' + os.sep + filename_with_txt_ending
        with open(path_to_correct_lines_file, 'r') as infile:
            correct_lines = [line.strip() for line in infile]
        if len(lines) != len(correct_lines):
            raise Exception(
                'lines has length {0} but correct_lines has length {1} for {2}'
                .format(len(lines), len(correct_lines), filename))
        for i in range(len(lines)):
            if lines[i] != correct_lines[i]:
                raise Exception(
                    'lines[{0}] has value\n{1}\n but correct_lines[{0}] has value\n{2}\n for {3}'
                    .format(i, lines[i], correct_lines[i], filename))
    print('Four frames test passed')
Example 45
def load_data(src_file, tgt_file):
    docs = []
    with open(src_file, 'r', encoding='utf-8') as src_reader, \
            open(tgt_file, 'r', encoding='utf-8') as tgt_reader:
        for src_line, tgt_line in zip(src_reader, tgt_reader):
            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            if src_line == "" or tgt_line == "":
                docs.append(None)
                continue
            src_sents = src_line.split('##SENT##')
            tgt_sents = tgt_line.strip().split('##SENT##')
            docs.append(Document(src_sents, tgt_sents))
    return docs
Example 46
 def __init__(self):
     super()
     self.document = Document()
     self.nodes = self.document.nodeList
     self.docs = [item.lower().split() for item in self.document.rawDocList]
     if Path('word2vecTrained.model').exists():
         self.word2Vec = Word2Vec.load('word2vecTrained.model')
     else:
         self.word2Vec = Word2Vec(self.docs,
                                  size=100,
                                  window=4,
                                  min_count=1,
                                  workers=3)
         self.word2Vec.save('word2vecTrained.model')
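A quick sanity check of the trained word vectors, run on the instance built above; a sketch assuming the query term actually occurs in the corpus (the size= keyword suggests gensim 3.x, where wv.most_similar behaves the same as in 4.x):

neighbours = self.word2Vec.wv.most_similar("document", topn=5)   # "document" is a made-up query term
print(neighbours)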
Example 47
 def test_zip(self):
     self.d.append({"a": ["a"]})
     self.d.append({"a": ["aa"]})
     self.d.append({"a": ["aaa"]})
     d2 = Document()
     d2.append({"b": ["b"]})
     d2.append({"b": ["bb"]})
     joined = Document.zip(self.d, d2)
     self.assertEqual(list(joined), [{"a": ["a"], "b": ["b"]}, {"a": ["aa"], "b": ["bb"]}, {"a": ["aaa"]}])
Example 48
 def __init__(self, manualFilepath=config.manualPath, k=60):
     documents = Document(manualFilepath)
     docs = documents.docList
     self.Doc = documents
     self.docTree = documents.tree
     self.nodes = documents.nodeList
     self.uniqueTerms = getAllUniqueTerms(docs)
     self.allTopics = documents.allTopics
     termDocMatrix = getTermDocMatrix(self.uniqueTerms, docs)
     u, s, vh = np.linalg.svd(termDocMatrix, full_matrices=False)
     S = np.diag(s)
     # replace with the variable k
     self.uk = u[:, :k]
     self.Sk = S[:k, :k]
     self.vhk = vh[:k, :]
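To score a query against this truncated SVD, the standard LSA folding-in step projects the query's term vector into the k-dimensional space and compares it with the document rows of vhk.T. A minimal numpy sketch, written as a free function over the uk/Sk/vhk factors computed above; how the query term-vector is built is left open:

import numpy as np

def lsa_similarities(q, uk, Sk, vhk):
    # q: query term-vector aligned with the term rows of termDocMatrix
    q_k = np.linalg.inv(Sk) @ uk.T @ q            # fold the query into the k-dim space
    doc_vecs = vhk.T                              # one k-dimensional row per document
    norms = np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(q_k) + 1e-12
    return doc_vecs @ q_k / norms                 # cosine similarity per document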
Example 49
def load_data(path=""):
    time.sleep(0.5)
    print("start to load data from path----->", path)
    time.sleep(0.5)
    file_list = os.listdir(path)
    sentences = list()

    for i in range(len(file_list)):
        filename = file_list[i]
        current_path = os.path.join(path, filename)
        document = Document(filename=current_path)
        for sentence in document.sentence_list:
            sentences.append(sentence)

    return sentences
Example 50
    def map(self, line):
        # TODO: Your code goes here -- call `self.emit(key, value)`

        doc = Document(line)
        shortest = float('inf')
        # current cluster label of this data point
        cur_center = 999
        for cluster in self.clusters:
            dist_temp = self.l2_norm(doc.tfidf, cluster.tfidf)

            if dist_temp < shortest:
                shortest = dist_temp
                cur_center = cluster.uid

        self.emit(str(cur_center), str(doc))
Example 51
def summarize_secitons(document, sections):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)

        # Ranker
        ranker = TextRank(sec_sentences)
        ranker.rank()
        sentencs = ranker.scores

        summary = []
        for x in range(num):
            idx = sentencs[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, sentencs[x][1], doc.get_section_name(idx)))
            summ.append(sent)
        text = ''
        logit("\nSection : " + section_name)
        for sent, score, section in summary:
            text += '\n' + sent.encode('utf-8')
        logit(text)
    file = DIR['BASE'] + "data/Summary.txt"
    with open(file, 'w') as sfile:
        sfile.write('\n'.join(summ).encode('utf-8'))

    # Evaluator
    guess_summary_list = [file]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
Example 52
def predict(test_data):
    with open('predictions.csv', 'w', encoding='utf-8') as output:
        writer = csv.writer(output)
        writer.writerow([
            'document', 'predict_class', 'predict_score', 'exp_predict_score'
        ])
        for instance in test_data.iterrows():
            doctext = instance[1]['document']
            doc = Document(doctext)
            predict_clas = max(stats.classes,
                               key=lambda c: _compute_score(doc, c))
            predict_score = _compute_score(doc, predict_clas)
            exp_predict_score = np.exp(predict_score)
            writer.writerow(
                [doctext, predict_clas, predict_score, exp_predict_score])
Example 53
    def search(self):
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tDocument Retrieval", self.id_q)

        search_engines = self._get_search_engines()

        try:
            num = int(MyConfig.get("document_retrieval", "n_results"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            num = 10

        results = []
        for engine in search_engines:
            try:
                results += engine.search(self.query, count=num)
            except Exception as e:
                logger = logging.getLogger("qa_logger")
                logger.error("Problem with search engine.")
                logger.debug(e)
                sys.exit(1)

        doc_list = []
        # rank loops over [0..num-1]
        rank = 0
        # ignore repeated urls
        unique_urls = set()
        for resource in results:
            if resource.url in unique_urls:
                continue
            unique_urls.add(resource.url)

            # rank+1 loops over [1..num]
            # rank+1 is the relative position of the results
            doc_list.append(Document(resource, rank + 1))
            rank = (rank + 1) % num

        try:
            if MyConfig.get("persistence", "document") == "True":
                output = open("documentos.pkl", "wb")
                pickle.dump(doc_list, output, 0)
                output.close()
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))

        return doc_list
Example 54
 def __init__(self, docCSV):
     self.doc = Document()
     self.allHeadlines = self.doc.allTopics
     self.questions = []
     self.expectedPages = []
     self.expectedTopicHeadline = []
     with open(docCSV) as f:
         reader = csv.reader(f)
         # Skip Header
         next(reader)
         questions = []
         for row in reader:
             self.expectedTopicHeadline.append(row[0])
             self.expectedPages.append(list(map(int, row[1].split(","))))
             questions.append(list(map(str.strip, row[2:])))
         self.questions = questions
Example 55
 def setUp(self):
     self.test_metadata = Metadata()
     self.test_text = ("Here is some test text. Blah blah blah blah \n" +
                       "1234567890987654321 Yea Alabama Drown 'em Tide!\n")
     self.test_filename = "test_superdoc.txt"
     self.test_document = Document(self.test_metadata, self.test_text,
                                   self.test_filename)
     self.test_metadata_list = ([
         self.test_metadata, self.test_metadata, self.test_metadata
     ])
     self.test_superdoc_text = self.test_text * 3
     #print self.test_superdoc_text
     self.test_superdoc = SuperDocument(self.test_metadata_list,
                                        self.test_superdoc_text,
                                        self.test_filename)
     self.assertEqual(len(self.test_superdoc.component_metadata), 3)
Example 56
 def handleMetaTable(self, data):
     data.pop(0)  # skip the header row
     data.sort(key=itemgetter(0))
     x, y = 120, 100
     hInterval = 50
     vInterval = 0
     for row in data:
         if x > 1400:
             x = 120
             y = y + vInterval + 50
             vInterval = 0
         clsitem = Document.create_clsitem(row[1], (x, y))
         w, h = clsitem.get_size()
         x = x + w + hInterval
         if vInterval < h:
             vInterval = h
Example 57
 def setUpClass(cls):
     logger.debug('Parsing documents.')
     cls.parser = Parser()
     setup = CompareXml()
     for file in os.listdir('data'):
         try:
             tree = cls.parser.parse_file('data/' + file)
             root = tree.getroot()
             form = cls.parser.find_tag(root, 'formular')
             form_id = str(cls.parser.get_attribute(form).get('id'))
             contract_number = cls.parser.find_tag(root, 'v_vertragsnummer')
             setup.documents[file] = Document(form_id, contract_number,
                                              root)
         except etree.ParseError as e:
             logger.error('File ' + file + ' cannot be parsed.\n' + str(e))
     setup.check_preconditions()