Example #1
def spacy_doc():
    spacy_lang = cache.load_spacy_lang("en")
    text = (
        "Two weeks ago, I was in Kuwait participating in an I.M.F. (International Monetary Fund) seminar for Arab educators. "
        "For 30 minutes, we discussed the impact of technology trends on education in the Middle East. "
        "And then an Egyptian education official raised his hand and asked if he could ask me a personal question: \"I heard Donald Trump say we need to close mosques in the United States,\" he said with great sorrow. "
        "\"Is that what we want our kids to learn?\"")
    spacy_doc = spacy_lang(text)
    return spacy_doc
Example #2
 def test_to_tokenized_text_nosents(self):
     spacy_lang = cache.load_spacy_lang("en")
     with spacy_lang.disable_pipes("parser"):
         doc = spacy_lang("This is sentence #1. This is sentence #2.")
     tokenized_text = doc._.to_tokenized_text()
     assert isinstance(tokenized_text, list)
     assert len(tokenized_text) == 1
     assert isinstance(tokenized_text[0], list)
     assert isinstance(tokenized_text[0][0], compat.unicode_)
Example #3
 def test_corpus_init_docs(self):
     limit = 3
     spacy_lang = cache.load_spacy_lang("en")
     texts = DATASET.texts(limit=limit)
     docs = [spacy_lang(text) for text in texts]
     corpus = Corpus("en", data=docs)
     assert len(corpus) == len(corpus.docs) == limit
     assert all(doc.vocab is corpus.spacy_lang.vocab for doc in corpus)
     assert all(doc1 is doc2 for doc1, doc2 in zip(docs, corpus))
Example #4
def spacy_lang():
    spacy_lang = cache.load_spacy_lang("en")
    text_stats_component = components.TextStatsComponent()
    spacy_lang.add_pipe(text_stats_component, after="parser")

    yield spacy_lang

    # remove component after running these tests
    spacy_lang.remove_pipe("textacy_text_stats")
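
Several snippets on this page (the spacy_lang and spacy_doc functions above and below) read like pytest fixtures whose decorators were dropped when the examples were extracted. A minimal, hypothetical sketch of how such a fixture is typically declared and consumed, assuming pytest and textacy's cache module (fixture name and test text are illustrative only):

import pytest
from textacy import cache

@pytest.fixture(scope="module")
def spacy_doc():
    # load the cached "en" pipeline once and build a small doc for the tests
    spacy_lang = cache.load_spacy_lang("en")
    return spacy_lang("A short sentence for testing.")

def test_doc_text(spacy_doc):
    # pytest injects the fixture's return value by matching the parameter name
    assert spacy_doc.text == "A short sentence for testing."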
Example #5
def spacy_doc():
    spacy_lang = cache.load_spacy_lang("en")
    text = """
    The unit tests aren't going well.
    I love Python, but I don't love backwards incompatibilities.
    No programmers were permanently damaged for textacy's sake.
    Thank God for Stack Overflow.
    """
    spacy_doc = spacy_lang(text.strip())
    return spacy_doc
Example #6
 def test_invalid_data_lang_combo(self):
     spacy_lang = cache.load_spacy_lang("en")
     combos = (
         (spacy_lang("Hello, how are you my friend?"), "es"),
         (spacy_lang("Hello, how are you my friend?"), True),
         ("This is an English sentence.", True),
         (("This is an English sentence.", {
             "foo": "bar"
         }), True),
     )
     for data, lang in combos:
         with pytest.raises((ValueError, TypeError)):
             _ = make_spacy_doc(data, lang=lang)
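
For contrast with the invalid combinations above, a hedged sketch of calls that the other examples on this page suggest are valid, where lang is a language code or an already-loaded pipeline and data is plain text or a (text, metadata) pair (texts and metadata here are made up):

spacy_lang = cache.load_spacy_lang("en")
doc1 = make_spacy_doc("This is an English sentence.", lang="en")
doc2 = make_spacy_doc(("This is an English sentence.", {"foo": "bar"}), lang=spacy_lang)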
Example #7
def test_to_gensim(spacy_doc):
    spacy_lang = cache.load_spacy_lang("en")
    result = export.docs_to_gensim(
        [spacy_doc], spacy_lang.vocab,
        filter_stops=True, filter_punct=True, filter_nums=True,
    )
    assert isinstance(result[0], compat.unicode_)
    assert isinstance(result[1], list)
    assert isinstance(result[1][0], list)
    assert isinstance(result[1][0][0], tuple)
    assert (
        isinstance(result[1][0][0][0], compat.int_types)
        and isinstance(result[1][0][0][1], compat.int_types)
    )
Example #8
def spacy_doc():
    spacy_lang = cache.load_spacy_lang("en")
    text = """
    Friedman joined the London bureau of United Press International after completing his master's degree. He was dispatched a year later to Beirut, where he lived from June 1979 to May 1981 while covering the Lebanon Civil War. He was hired by The New York Times as a reporter in 1981 and re-dispatched to Beirut at the start of the 1982 Israeli invasion of Lebanon. His coverage of the war, particularly the Sabra and Shatila massacre, won him the Pulitzer Prize for International Reporting (shared with Loren Jenkins of The Washington Post). Alongside David K. Shipler he also won the George Polk Award for foreign reporting.

    In June 1984, Friedman was transferred to Jerusalem, where he served as the New York Times Jerusalem Bureau Chief until February 1988. That year he received a second Pulitzer Prize for International Reporting, which cited his coverage of the First Palestinian Intifada. He wrote a book, From Beirut to Jerusalem, describing his experiences in the Middle East, which won the 1989 U.S. National Book Award for Nonfiction.

    Friedman covered Secretary of State James Baker during the administration of President George H. W. Bush. Following the election of Bill Clinton in 1992, Friedman became the White House correspondent for the New York Times. In 1994, he began to write more about foreign policy and economics, and moved to the op-ed page of The New York Times the following year as a foreign affairs columnist. In 2002, Friedman won the Pulitzer Prize for Commentary for his "clarity of vision, based on extensive reporting, in commenting on the worldwide impact of the terrorist threat."

    In February 2002, Friedman met Saudi Crown Prince Abdullah and encouraged him to make a comprehensive attempt to end the Arab-Israeli conflict by normalizing Arab relations with Israel in exchange for the return of refugees alongside an end to the Israel territorial occupations. Abdullah proposed the Arab Peace Initiative at the Beirut Summit that March, which Friedman has since strongly supported.

    Friedman received the 2004 Overseas Press Club Award for lifetime achievement and was named to the Order of the British Empire by Queen Elizabeth II.

    In May 2011, The New York Times reported that President Barack Obama "has sounded out" Friedman concerning Middle East issues.
    """
    spacy_doc = spacy_lang(preprocess_text(text), disable=["parser"])
    return spacy_doc
Example #9
def main():
    args = parser.parse_args()

    # if parsing input
    if args.parse_infile:
        with open(args.infile, 'r', encoding="utf8", errors='ignore') as f:
            pipeline = load_spacy_lang('en',
                                       disable=('tagger', 'ner', 'textcat'))
            corp = Corpus(pipeline)
            corp.add(f)
            corp.save(args.infile.rsplit('.', 1)[0] + '.corp')
    # if loading a preparsed corpus
    else:
        corp = Corpus.load('en', args.infile)

    # dictionary mapping dependency triples to their frequencies
    features = defaultdict(lambda: defaultdict(lambda: set()))

    # for every token in the corpus
    for tok in chain.from_iterable(corp.docs):
        if tok.is_alpha and tok.head.is_alpha and tok.dep_ in DEP_TYPES:
            feature = (tok.dep, tok.lemma)
            features[tok.head.pos][tok.head.lemma_.lower()].add(feature)

    top_similar = []
    for pos in features.keys():
        total_num = len(features[pos])
        global prob_cache
        prob_cache = {}
        for word_1, word_2 in combinations(features[pos].keys(), 2):
            sim = calc_similarity(features[pos], total_num, word_1, word_2,
                                  args.feat_thresh)
            if sim is None:
                continue

            if len(top_similar) < args.num:
                top_similar.append((sim, word_1, word_2))
            elif sim > top_similar[-1][0]:
                top_similar[-1] = (sim, word_1, word_2)
                top_similar.sort(reverse=True)

    print('{:>15} {:>15} {:>12}'.format('word 1', 'word 2', 'similarity'))
    for pair in top_similar:
        print('{0[1]:>15} {0[2]:>15} {0[0]:>12.3f}'.format(pair))
    print()
Example #10
def main():
    args = parser.parse_args()

    # if parsing input
    if args.parse_infile:
        with open(args.infile, 'r', encoding="utf8", errors='ignore') as f:
            pipeline = load_spacy_lang('en', disable=('tagger', 'ner', 'textcat'))
            corp = Corpus(pipeline)
            corp.add(f)
            corp.save(args.infile.rsplit('.', 1)[0] + '.corp')
    # if loading a preparsed corpus
    else:
        corp = Corpus.load('en', args.infile)

    # dictionary mapping dependency triples to their frequencies
    dep_triples = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0)))

    # for every token in the corpus
    for tok in chain.from_iterable(corp.docs):
        # if this token and its head aren't alpha, skip them
        if not (tok.is_alpha and tok.head.is_alpha):
            continue
        # if this token is part of a compound word
        # and the word it's connected to has a dependency we care about
        # add it to the dictionary
        if tok.dep_ == 'compound' and tok.head.dep_ in DEP_TYPES:
            full_word = tok.lemma_.lower() + ' ' + tok.head.lemma_.lower()
            dep_triples[tok.head.dep_][full_word][tok.head.head.lemma_.lower()] += 1
        # if this token has a dependency we care about and hasn't already been included
        # by the previous condition
        if tok.dep_ in DEP_TYPES and not any(c.dep_ == 'compound' for c in tok.children):
            dep_triples[tok.dep_][tok.lemma_.lower()][tok.head.lemma_.lower()] += 1

    mods_with_minfo = calc_minfo_for_set(dep_triples, args.dep, args.head, args.const)   
    if not mods_with_minfo:
        sys.exit(1)

    print('mutual info for words with relationship {} to "{}" using constant {}:'.format(args.dep, args.head, args.const))
    print()
    print('{:>12} {:>30} {:>5}'.format('mutual info', 'word', 'freq'))

    for i, result in enumerate(mods_with_minfo):
        if i > args.num:
            return
        print('{0[0]:>12.5} {0[1]:>30} {0[2]:>5}'.format(result))
Example #11
 def test_corpus_add(self, corpus):
     spacy_lang = cache.load_spacy_lang("en")
     datas = (
         "This is an english sentence.",
         ("This is an english sentence.", {
             "foo": "bar"
         }),
         spacy_lang("This is an english sentence."),
         ["This is one sentence.", "This is another sentence."],
         [("This is sentence #1.", {
             "foo": "bar"
         }), ("This is sentence #2.", {
             "bat": "baz"
         })],
         [
             spacy_lang("This is sentence #1"),
             spacy_lang("This is sentence #2")
         ],
     )
     n_docs = corpus.n_docs
     for data in datas:
         corpus.add(data)
         assert corpus.n_docs > n_docs
         n_docs = corpus.n_docs
Example #12
def langs():
    return ("en", cache.load_spacy_lang("en"), lambda text: "en")
Example #13
 def test_doc_data(self, langs):
     spacy_lang = cache.load_spacy_lang("en")
     doc = spacy_lang("This is an English sentence.")
     assert isinstance(make_spacy_doc(doc), spacy.tokens.Doc)
     for lang in langs:
         assert isinstance(make_spacy_doc(doc, lang=lang), spacy.tokens.Doc)
Example #14
def empty_spacy_doc():
    spacy_lang = cache.load_spacy_lang("en")
    return spacy_lang("")
Example #15
def doc(text):
    spacy_lang = cache.load_spacy_lang("en")
    return make_spacy_doc(text, lang=spacy_lang)
Example #16
def spacy_doc():
    text = "I would have lived in peace. But my enemies brought me war."
    spacy_lang = cache.load_spacy_lang("en")
    spacy_doc = spacy_lang(text)
    return spacy_doc
Example #17
 def test_corpus_init_no_parser(self):
     spacy_lang = cache.load_spacy_lang("en", disable=("parser", ))
     corpus = Corpus(spacy_lang,
                     data=(spacy_lang("This is a sentence in a doc."), ))
     assert len(corpus) == 1
     assert corpus.n_sents == 0
Example #18
 def test_bad_name(self):
     for name in ("unk", "un"):
         with pytest.raises((OSError, IOError)):
             _ = cache.load_spacy_lang(name)
     with pytest.raises(ImportError):
         _ = cache.load_spacy_lang("un", allow_blank=True)
Example #19
 def test_disable_hashability(self):
     with pytest.raises(TypeError):
         _ = cache.load_spacy_lang("en",
                                   disable=["tagger", "parser", "ner"])
Example #20
 def test_load_blank(self):
     assert isinstance(cache.load_spacy_lang("ar", allow_blank=True),
                       spacy.language.Language)
Example #21
 def test_corpus_init_lang(self):
     assert isinstance(Corpus("en"), Corpus)
     assert isinstance(Corpus(cache.load_spacy_lang("en")), Corpus)
     for bad_lang in (b"en", None):
         with pytest.raises(TypeError):
             Corpus(bad_lang)
Example #22
def test_make_doc_from_text_chunks():
    text = "Burton forgot to add tests for this function."
    for lang in ("en", cache.load_spacy_lang("en")):
        spacy_doc = utils.make_doc_from_text_chunks(text, lang)
        assert isinstance(spacy_doc, Doc)
        assert spacy_doc.text == text
Example #23
def spacy_doc():
    spacy_lang = cache.load_spacy_lang("en")
    spacy_doc = spacy_lang(TEXT)
    return spacy_doc
Example #24
 def test_load_model(self):
     for lang in ["en", "en_core_web_sm"]:
         for disable in [None, ("tagger", "parser", "ner")]:
             assert isinstance(cache.load_spacy_lang(lang, disable=disable),
                               spacy.language.Language)
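
Taken together, the examples suggest a common pattern: load a cached pipeline once, then reuse it to build individual docs and a corpus. A minimal hedged sketch combining pieces shown above (the text and assertion are illustrative only, and the imports assume textacy's top-level exports):

import textacy
from textacy import cache

# repeat calls with the same arguments should return the same cached pipeline
spacy_lang = cache.load_spacy_lang("en")

# build an individual doc with the loaded pipeline...
doc = textacy.make_spacy_doc("Two weeks ago, I was in Kuwait.", lang=spacy_lang)

# ...and collect docs into a Corpus bound to that same pipeline
corpus = textacy.Corpus(spacy_lang, data=[doc])
assert len(corpus) == 1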