Example 1
def preprocess_job_market(in_path, out_path):
    import glob
    import os
    import json
    import codecs
    job_texts = []
    for filename in glob.glob(in_path + '*.json'):
        try:
            with codecs.open(filename, encoding='utf-8') as job_file:
                content = json.load(job_file)
                job_texts.append(content.get('description', u''))
        except Exception:
            print("===Exception reading file " + filename)
            continue
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    corpus_text = '\n'.join(text for text in job_texts)
    corpus.add_text(corpus_text)

    res_file = out_path + 'job_market.csv'
    if not os.path.isfile(res_file):
        termList1 = term_list(
            keyterms.textrank(corpus[0], normalize=u'lower', n_keyterms=30))
        termList2 = term_list(
            keyterms.sgrank(corpus[0],
                            ngrams=(1, 2),
                            normalize=u'lower',
                            window_width=100,
                            n_keyterms=70,
                            idf=None))
        termSet1 = set(termList1)
        termSet2 = set(termList2)
        diffSet = termSet1 - termSet2
        termList = termList2 + list(diffSet)
        save_terms_text(res_file, termList)
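A minimal standalone sketch of the final merge step above, which keeps every sgrank term and appends only the textrank terms that sgrank missed (the term lists here are hypothetical, standing in for the project's term_list() output):

sgrank_terms = ['machine learning', 'python', 'sql']    # hypothetical sgrank terms
textrank_terms = ['python', 'communication']            # hypothetical textrank terms
# textrank-only terms; their relative order is arbitrary because it comes from a set
extra_terms = set(textrank_terms) - set(sgrank_terms)
merged = sgrank_terms + list(extra_terms)
print(merged)  # ['machine learning', 'python', 'sql', 'communication']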
Example 2
def preprocess_kags(in_path, out_path):
    import glob
    import os
    spacy_lang = en_core_web_sm.load()
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        _, kag_name = os.path.split(kag_path)
        corpus = textacy.corpus.Corpus(spacy_lang)
        texts = []
        for comp_path in glob.glob(kag_path + '/*'):
            for filename in glob.glob(comp_path + '/*.txt'):
                texts.append(open(filename, 'r').read().decode('utf-8'))
        corpus_text = '\n'.join(text for text in texts)
        corpus.add_text(corpus_text)
        #        _ , comp_file = os.path.split(comp_path)
        #        sindex = len(kag_name) + 1
        #        eindex = sindex + comp_file[sindex:].index('_')
        res_file = '{}.csv'.format(get_kag(kag_name))
        doc_idf = corpus.word_doc_freqs(lemmatize=None,
                                        weighting='idf',
                                        lowercase=True,
                                        as_strings=True)
        termList = keyterms.sgrank(corpus[0],
                                   ngrams=(1, 2, 3),
                                   normalize=u'lower',
                                   window_width=500,
                                   n_keyterms=30,
                                   idf=doc_idf)
        save_terms_csv(out_path + res_file, termList)
Example 3
def preprocess_ms_jobs(in_path, out_path):
    from docx import Document
    import os
    from googletrans import Translator
    spacy_lang = en_core_web_sm.load()
    _, res_name = os.path.split(out_path)
    wordDoc = Document(in_path)
    job_count = 0
    for table in wordDoc.tables:
        desc = ''
        req = ''
        for row in table.rows:
            for cell in row.cells:
                if cell.text == "Functieomschrijving":
                    desc = row.cells[1].text
                if cell.text == "Functie-eisen":
                    req = row.cells[1].text
        if desc and req:
            corpus = textacy.corpus.Corpus(spacy_lang)
            corpus_text = '\n'.join(text for text in [desc, req])
            translator = Translator()
            corpus_text_en = translator.translate(corpus_text, dest='en').text
            corpus_text_en = corpus_text_en.encode('ascii', 'ignore')
            corpus_text_en = corpus_text_en.decode('utf-8')
            corpus.add_text(corpus_text_en)
            termList = keyterms.sgrank(corpus[0],
                                       ngrams=(1, 2, 3),
                                       normalize=u'lower',
                                       n_keyterms=100)
            res_file = out_path + 'job{0}.csv'.format(job_count)
            save_terms_csv(res_file, termList)
            job_count += 1
Example 4
 def test_ngrams_1(self, spacy_doc):
     expected = ["friedman", "international", "beirut", "bureau", "york"]
     observed = [
         term
         for term, _ in keyterms.sgrank(spacy_doc, ngrams=1, n_keyterms=5)
     ]
     assert len(expected) == len(observed)
Example 5
 def test_sgrank_ngrams_1(self):
     expected = ['friedman', 'international', 'beirut', 'bureau', 'york']
     observed = [
         term for term, _ in keyterms.sgrank(
             self.spacy_doc, ngrams=1, n_keyterms=5)
     ]
     self.assertEqual(len(expected), len(observed))
Example 6
def test_sgrank_n_keyterms(spacy_doc):
    expected = [
        'new york times', 'new york times jerusalem bureau chief', 'friedman',
        'president george h. w. bush', 'david k. shipler'
    ]
    observed = [term for term, _ in keyterms.sgrank(spacy_doc, n_keyterms=5)]
    assert len(expected) == len(observed)
Example 7
def keywords():
    #print request.get_json()
    arg = request.get_json()
    doc = textacy.Doc(arg['content'],
                      metadata={'title': arg['title']},
                      lang='en_core_web_sm')
    sgrank_keywords = dict(keyterms.sgrank(doc))
    singlerank_keywords = dict(keyterms.singlerank(doc))
    textrank_keywords = dict(keyterms.textrank(doc))
    sgrank_keywords.update((x, y * 0.9) for x, y in sgrank_keywords.items())
    textrank_keywords.update(
        (x, y * 0.05) for x, y in textrank_keywords.items())
    singlerank_keywords.update(
        (x, y * 0.05) for x, y in singlerank_keywords.items())
    keywords = dict(
        Counter(sgrank_keywords) + Counter(textrank_keywords) +
        Counter(singlerank_keywords))
    sorted_keywords = sorted(keywords.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    keyword_string = ""

    for i, key in enumerate(sorted_keywords):
        if (i == int(len(sorted_keywords) / 2)):
            keyword_string = keyword_string + "||"
        if (i == len(sorted_keywords) - 1
                or i == int(len(sorted_keywords) / 2) - 1):
            keyword_string = keyword_string + key[0]
        else:
            keyword_string = keyword_string + key[0] + ",,"

    return keyword_string
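The endpoint above re-weights each algorithm's scores (0.9 for sgrank, 0.05 each for textrank and singlerank) and relies on Counter addition to sum the scores of terms returned by more than one algorithm. A small self-contained sketch of that combination step, using made-up scores:

from collections import Counter

sgrank_scores = {'neural network': 0.4, 'training': 0.2}   # hypothetical term -> score maps
textrank_scores = {'training': 0.8, 'dataset': 0.6}

# weight each algorithm, then let Counter addition sum the scores of shared terms
combined = (Counter({t: s * 0.9 for t, s in sgrank_scores.items()})
            + Counter({t: s * 0.05 for t, s in textrank_scores.items()}))
ranked = sorted(combined.items(), key=lambda kv: kv[1], reverse=True)
print(ranked)  # [('neural network', 0.36), ('training', 0.22), ('dataset', 0.03)], up to float rounding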
Example 8
    def key_terms(self, algorithm='sgrank', n=10):
        """
        Extract key terms from a document using `algorithm`.

        Args:
            algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional): name
                of algorithm to use for key term extraction
            n (int or float, optional): if int, number of top-ranked terms to return
                as keyterms; if float, must be in the open interval (0.0, 1.0),
                representing the fraction of top-ranked terms to return as keyterms

        Returns:
            list[(str, float)]: sorted list of top `n` key terms and their
                corresponding scores

        Raises:
            ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}

        .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
        .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
        .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
        """
        if algorithm == 'sgrank':
            return keyterms.sgrank(self.spacy_doc, window_width=1500, n_keyterms=n)
        elif algorithm == 'textrank':
            return keyterms.textrank(self.spacy_doc, n_keyterms=n)
        elif algorithm == 'singlerank':
            return keyterms.singlerank(self.spacy_doc, n_keyterms=n)
        else:
            raise ValueError('algorithm {} not a valid option'.format(algorithm))
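A hypothetical call of the method above, assuming doc is an instance of the class that defines it (and therefore carries the parsed document as self.spacy_doc); per the docstring, n may be an absolute count or a fraction in the open interval (0.0, 1.0):

top_terms = doc.key_terms(algorithm='sgrank', n=10)          # ten highest-ranked terms
top_fraction = doc.key_terms(algorithm='textrank', n=0.05)   # top 5% of ranked terms
for term, score in top_terms:
    print(term, round(score, 4))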
Example 9
    def key_terms(self, algorithm='sgrank', n=10):
        """
        Extract key terms from a document using `algorithm`.

        Args:
            algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional): name
                of algorithm to use for key term extraction
            n (int or float, optional): if int, number of top-ranked terms to return
                as keyterms; if float, must be in the open interval (0.0, 1.0),
                representing the fraction of top-ranked terms to return as keyterms

        Returns:
            list[(str, float)]: sorted list of top `n` key terms and their
                corresponding scores

        Raises:
            ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}

        .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
        .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
        .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
        """
        if algorithm == 'sgrank':
            return keyterms.sgrank(self.spacy_doc,
                                   window_width=1500,
                                   n_keyterms=n)
        elif algorithm == 'textrank':
            return keyterms.textrank(self.spacy_doc, n_keyterms=n)
        elif algorithm == 'singlerank':
            return keyterms.singlerank(self.spacy_doc, n_keyterms=n)
        else:
            raise ValueError(
                'algorithm {} not a valid option'.format(algorithm))
Example 10
 def test_sgrank_norm_normalized_str(self):
     expected = [
         'New York Times', 'New York Times Jerusalem Bureau Chief', 'Friedman',
         'President George H. W. Bush', 'George Polk Award']
     observed = [
         term for term, _
         in keyterms.sgrank(self.spacy_doc, normalize=spacy_utils.normalized_str, n_keyterms=5)]
     self.assertEqual(len(expected), len(observed))
Example 11
def test_sgrank_norm_none(spacy_doc):
    expected = [
        'New York Times', 'New York Times Jerusalem Bureau Chief', 'Friedman',
        'President George H. W. Bush', 'George Polk Award']
    observed = [
        term for term, _
        in keyterms.sgrank(spacy_doc, normalize=None, n_keyterms=5)]
    assert len(expected) == len(observed)
Example 12
def test_sgrank_window_width(spacy_doc):
    expected = [
        'new york times', 'friedman', 'new york times jerusalem',
        'times jerusalem bureau', 'second pulitzer prize']
    observed = [
        term for term, _
        in keyterms.sgrank(spacy_doc, window_width=50, n_keyterms=5)]
    assert len(expected) == len(observed)
Example 13
def test_sgrank(spacy_doc):
    expected = [
        'new york times', 'york times jerusalem bureau chief', 'friedman',
        'president george h. w.', 'george polk award', 'pulitzer prize',
        'u.s. national book award', 'international reporting', 'beirut',
        'washington post']
    observed = [term for term, _ in keyterms.sgrank(spacy_doc)]
    assert len(expected) == len(observed)
Example 14
 def test_n_keyterms(self, spacy_doc):
     expected = [
         "new york times",
         "new york times jerusalem bureau chief",
         "friedman",
         "president george h. w. bush",
         "david k. shipler",
     ]
     observed = [
         term for term, _ in keyterms.sgrank(spacy_doc, n_keyterms=5)
     ]
     assert len(expected) == len(observed)
     # can't do this owing to randomness of results
     # for e, o in zip(expected, observed):
     #     assert e == o
     observed = [
         term for term, _ in keyterms.sgrank(spacy_doc, n_keyterms=0.1)
     ]
     assert len(observed) > 0
Example 15
def test_sgrank_ngrams_1_2_3(spacy_doc):
    expected = [
        'new york times', 'friedman', 'pulitzer prize', 'beirut',
        'international reporting'
    ]
    observed = [
        term for term, _ in keyterms.sgrank(
            spacy_doc, ngrams=(1, 2, 3), n_keyterms=5)
    ]
    assert len(expected) == len(observed)
Example 16
def test_sgrank_n_keyterms(spacy_doc):
    expected = [
        "new york times",
        "new york times jerusalem bureau chief",
        "friedman",
        "president george h. w. bush",
        "david k. shipler",
    ]
    observed = [term for term, _ in keyterms.sgrank(spacy_doc, n_keyterms=5)]
    assert len(expected) == len(observed)
Example 17
 def test_sgrank_norm_lower(self):
     expected = [
         'new york times', 'president george h. w. bush', 'friedman',
         'new york times jerusalem bureau', 'george polk award']
     observed = [
         term for term, _
         in keyterms.sgrank(self.spacy_doc, normalize='lower', n_keyterms=5)]
     self.assertEqual(len(expected), len(observed))
     for term in observed:
         self.assertEqual(term, term.lower())
Example 18
def test_sgrank_norm_lower(spacy_doc):
    expected = [
        'new york times', 'president george h. w. bush', 'friedman',
        'new york times jerusalem bureau', 'george polk award']
    observed = [
        term for term, _
        in keyterms.sgrank(spacy_doc, normalize='lower', n_keyterms=5)]
    assert len(expected) == len(observed)
    for term in observed:
        assert term == term.lower()
Example 19
 def _apply_keyterm_ranking(self, doc, params=None):
     if self.method == 'sgrank':
         keywords = textacy.keyterms.sgrank(doc, **params) \
             if params else tck.sgrank(doc)
     elif self.method == 'textrank':
         keywords = textacy.keyterms.textrank(doc, **params) \
             if params else tck.textrank(doc)
     elif self.method == 'singlerank':
         keywords = textacy.keyterms.singlerank(doc, **params) \
             if params else tck.singlerank(doc)
     else:
         raise ValueError('unknown keyterm ranking method: {}'.format(self.method))
     return keywords
Example 20
def extract_keyterms(data):
    tokens = []
    doc = Doc(data, lang="en_core_web_md")
    res = keyterms.sgrank(doc, n_keyterms=100)

    for r in res:
        tokens.append(str(r[0]))

    if len(tokens) == 0:
        tokens = ["empty"]

    return tokens
Example 21
def test_sgrank_norm_none(spacy_doc):
    expected = [
        "New York Times",
        "New York Times Jerusalem Bureau Chief",
        "Friedman",
        "President George H. W. Bush",
        "George Polk Award",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, normalize=None, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
Example 22
def test_sgrank_window_width(spacy_doc):
    expected = [
        "new york times",
        "friedman",
        "new york times jerusalem",
        "times jerusalem bureau",
        "second pulitzer prize",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, window_width=50, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
Example 23
def test_sgrank_ngrams_1_2_3(spacy_doc):
    expected = [
        "new york times",
        "friedman",
        "pulitzer prize",
        "beirut",
        "international reporting",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, ngrams=(1, 2, 3), n_keyterms=5)
    ]
    assert len(expected) == len(observed)
Example 24
def test_sgrank_norm_lower(spacy_doc):
    expected = [
        "new york times",
        "president george h. w. bush",
        "friedman",
        "new york times jerusalem bureau",
        "george polk award",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, normalize="lower", n_keyterms=5)
    ]
    assert len(expected) == len(observed)
    for term in observed:
        assert term == term.lower()
Example 25
def preprocess_jobs_or_cvs(in_path, out_path):
    import glob
    import os
    for filename in glob.glob(in_path + '*.json'):
        _, cv_file = os.path.split(filename)
        res_file = out_path + '{0}.csv'.format(cv_file[0:cv_file.index('.')])
        if not os.path.isfile(res_file):
            corpus = read_cv2(filename)
            termList = keyterms.sgrank(corpus[0],
                                       ngrams=(1, 2),
                                       normalize=u'lower',
                                       window_width=500,
                                       n_keyterms=30,
                                       idf=None)
            save_terms_csv(res_file, termList)
Example 26
def preprocess_jobs_or_cvs(in_path, out_path):
    import glob
    import os
    job_count = 0  # limit number of jobs processed
    for filename in glob.glob(in_path + '*.json'):
        _, joc_file = os.path.split(filename)
        res_file = out_path + '{0}.csv'.format(joc_file[0:joc_file.index('.')])
        if not os.path.isfile(res_file):
            corpus = read_cv2(filename)
            termList = keyterms.sgrank(corpus[0],
                                       normalize=u'lower',
                                       n_keyterms=100)
            save_terms_csv(res_file, termList)
            job_count += 1
        if job_count >= 100:
            break
Example 27
 def test_base(self, spacy_doc):
     expected = [
         "new york times",
         "york times jerusalem bureau chief",
         "friedman",
         "president george h. w.",
         "george polk award",
         "pulitzer prize",
         "u.s. national book award",
         "international reporting",
         "beirut",
         "washington post",
     ]
     observed = [term for term, _ in keyterms.sgrank(spacy_doc)]
     assert len(expected) == len(observed)
Example 28
def get_keyphrases_sgrank(text, idfs):
    doc = make_spacy_doc(bioclean_mod(text), lang='en')
    keyphrases = keyterms.sgrank(
        doc,
        ngrams       = tuple(range(1, 4)),
        normalize    = None,  # None, # u'lemma', # u'lower'
        window_width = 50,
        n_keyterms   = 5,
        idf          = None,
        include_pos  = ("NOUN", "PROPN", "ADJ"),  # ("NOUN", "PROPN", "ADJ"), # ("NOUN", "PROPN", "ADJ", "VERB", "CCONJ"),
    )
    if len(keyphrases) == 0:
        # print([(tok, idfs[tok] if tok in idfs else max_idf) for tok in doc if tok.pos=='NOUN'])
        toks_with_idfs  = [(tok, idfs[tok] if tok in idfs else max_idf) for tok in doc]
        toks_with_idfs  = sorted(toks_with_idfs, key=lambda x: x[1])
        keyphrases      = [(tt[0].text, tt[1]) for tt in toks_with_idfs]
    # return text, keyphrases
    return keyphrases
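A hypothetical call of the helper above; bioclean_mod, make_spacy_doc, and the module-level max_idf fallback weight are assumed to come from the surrounding project, and the idfs table is presumed to map token strings to IDF weights:

idfs = {'protein': 4.1, 'binding': 3.3}   # hypothetical IDF lookup
phrases = get_keyphrases_sgrank('Protein binding assays were performed in vitro.', idfs)
print(phrases[:3])                        # top-ranked (phrase, score) pairs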
Example 29
def preprocess_category(in_path, out_path, category_name):
    import glob
    import os
    spacy_lang = en_core_web_sm.load()
    print('===GIVEN CATEGORY: ' + category_name)
    for cat_path in glob.glob(in_path + '*'):
        _, cat_name = os.path.split(cat_path)
        print('===CATEGORY: ' + cat_name)
        if category_name == cat_name:
            print('###Fine, found category directory ...')
            #        for comp_path in glob.glob(kag_path + '/*'):
            corpus = textacy.corpus.Corpus(spacy_lang)
            texts = []
            for filename in glob.glob(cat_path + '/*.txt'):
                texts.append(open(filename, 'r').read().decode('utf-8'))
            corpus_text = '\n'.join(text for text in texts)
            corpus.add_text(corpus_text)
            #            _ , comp_file = os.path.split(comp_path)
            #            sindex = len(kag_name) + 1
            #            eindex = sindex + comp_file[sindex:].index('_')
            #            res_file = '{}.csv'.format(comp_file[sindex:eindex])
            res_file = '{}.csv'.format(category_name.lower())

            termList1 = term_list(
                keyterms.textrank(corpus[0], normalize=u'lower',
                                  n_keyterms=30))
            doc_idf = corpus.word_doc_freqs(lemmatize=None,
                                            weighting='idf',
                                            lowercase=True,
                                            as_strings=True)
            termList2 = term_list(
                keyterms.sgrank(corpus[0],
                                ngrams=(1, 2, 3),
                                normalize=u'lower',
                                window_width=500,
                                n_keyterms=70,
                                idf=doc_idf))
            termSet1 = set(termList1)
            termSet2 = set(termList2)
            diffSet = termSet1 - termSet2
            termList = termList2 + list(diffSet)
            save_terms_text(out_path + res_file, termList)
            break
Example 30
def preprocess_competences_combined(in_path, out_path):
    import glob
    import os
    spacy_lang = en_core_web_sm.load()
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        _, kag_name = os.path.split(kag_path)
        for comp_path in glob.glob(kag_path + '/*'):
            corpus = textacy.corpus.Corpus(spacy_lang)
            texts = []
            for filename in glob.glob(comp_path + '/*.txt'):
                #                content = open(filename, 'r').read().decode('utf-8') # testing preprocess
                #                clean_text = preprocess_text(content, no_punct=True, no_contractions=True, no_accents=True)
                #                texts.append(clean_text)
                texts.append(open(filename, 'r').read().decode('utf-8'))
            corpus_text = '\n'.join(text for text in texts)
            corpus.add_text(corpus_text)
            _, comp_file = os.path.split(comp_path)
            sindex = len(kag_name) + 1
            eindex = sindex + comp_file[sindex:].index('_')
            res_file = '{}.csv'.format(comp_file[sindex:eindex])

            termList1 = term_list(
                keyterms.textrank(corpus[0], normalize=u'lower',
                                  n_keyterms=30))
            doc_idf = corpus.word_doc_freqs(lemmatize=None,
                                            weighting='idf',
                                            lowercase=True,
                                            as_strings=True)
            termList2 = term_list(
                keyterms.sgrank(corpus[0],
                                ngrams=(1, 2, 3),
                                normalize=u'lower',
                                window_width=500,
                                n_keyterms=70,
                                idf=doc_idf))
            termSet1 = set(termList1)
            termSet2 = set(termList2)
            diffSet = termSet1 - termSet2
            termList = termList2 + list(diffSet)
            save_terms_text(out_path + res_file, termList)
Example 31
def preprocess_competences2(in_path, out_path):
    import glob
    import os
    spacy_lang = en_core_web_sm.load()
    for kag_path in glob.glob(in_path + '/*'):
        _, kag_name = os.path.split(kag_path)
        print('===KAG: ' + kag_name)
        for filename in glob.glob(kag_path + '/*.txt'):
            _, comp_file = os.path.split(filename)
            print('===competence file: ' + comp_file)
            if comp_file.index('.') >= 5:
                print('===preprocessing competence file: ' + comp_file)
                corpus = textacy.corpus.Corpus(spacy_lang)
                corpus.add_text(open(filename, 'r').read().decode('utf-8'))
                #                doc_idf = corpus.word_doc_freqs(lemmatize=None, weighting='idf', lowercase=True, as_strings=True)
                termList = term_list(
                    keyterms.sgrank(corpus[0],
                                    ngrams=(1, 2, 3),
                                    normalize=u'lower',
                                    idf=None))
                res_file = '{}.csv'.format(comp_file[:-4])
                print('===Writing to: ' + res_file)
                save_terms_text(out_path + res_file, termList)