Python Docの例、textacy.Doc Pythonの例

コード例 #1

0

ファイルを表示

ファイル: sva_rb2.py プロジェクト: empirical-org/Quill-NLP-Tools-and-Datasets

def simplify_compound_subjects(sentence_str):
    """Return a textacy Doc of the sentence with compound subjects simplified.

    The sentence is parsed with the 'en_core_web_lg' model and searched for
    noun/pronoun lists joined by a coordinating conjunction:

    * a list joined by 'and' is replaced with the lowercase word 'they', e.g.
      'The man, the boy, and the girl went to school.' becomes
      'they went to school.' (when the tagger matches the pattern);
    * a list joined by any other conjunction (e.g. 'or') is replaced with its
      final token, plus the preceding determiner if there is one, e.g.
      'The man, the boy, or the girls are frauds.' becomes
      'the girls are frauds.'

    Sentences without a compound subject only get their whitespace
    normalised.

    Args:
        sentence_str: the sentence text to simplify.

    Returns:
        A new ``textacy.Doc`` built from the rewritten text. Runs of
        whitespace are collapsed to one space and the text is stripped.
        Note that any '文' character already present in the input is removed
        as well, because that character is used internally as padding.
    """

    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')

    cs_patterns = [
        r'((<DET>?(<NOUN|PROPN>|<PRON>)+<PUNCT>)+<DET>?(<NOUN|PROPN>|<PRON>)+<PUNCT>?<CCONJ><DET>?(<NOUN|PROPN>|<PRON>)+)|'
        r'(<DET>?(<NOUN|PROPN>|<PRON>)<CCONJ><DET>?(<NOUN|PROPN>|<PRON>))'
    ]

    for cs_pattern in cs_patterns:

        compound_subjects = textacy.extract.pos_regex_matches(
            sentence_doc, cs_pattern)

        # [[start_repl, end_repl, replacement], ...] as character offsets
        # into sentence_doc.text
        chars_to_repl = []

        for cs in compound_subjects:
            span_start = cs[0].idx
            span_end = cs[-1].idx + len(cs[-1].text)
            for w in cs:
                if w.pos_ != 'CCONJ':
                    continue
                if w.text.lower() == 'and':
                    # replace with they
                    repl = 'they'
                else:
                    # replace with final <DET>?(<NOUN|PROPN>|<PRON>)
                    repl = cs[-1:].text
                    if cs[-2].pos_ == 'DET':
                        repl = cs[-2:].text
                # Pad with an unexpected char so the replacement has the same
                # length as the span it overwrites; this keeps every other
                # character offset valid until the padding is stripped below.
                repl = repl.ljust(len(cs.text), '文')
                chars_to_repl.append([span_start, span_end, repl])

        new_sent_str = sentence_doc.text
        for start_repl, end_repl, replacement in chars_to_repl:
            new_sent_str = (new_sent_str[:start_repl] + replacement +
                            new_sent_str[end_repl:])
        new_sent_str = new_sent_str.replace('文', '')
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        new_sent_str = re.sub(r'\s+', ' ', new_sent_str).strip()

    sentence_doc = textacy.Doc(new_sent_str, lang='en_core_web_lg')
    return sentence_doc

コード例 #2

0

ファイルを表示

ファイル: utils.py プロジェクト: vishalbelsare/MFEAFP

def headline_to_svo(text):
    """Return the first subject-verb-object triple found in a headline.

    Non-printable characters are dropped first. The main verbs are then
    located in a lower-cased copy of the headline and written back, in lower
    case, into the original-case text at the position of their first
    occurrence, before the SVO extraction runs.

    Returns the first triple yielded by textacy, or None when there is none.
    """
    printable = ''.join(ch for ch in text if ch in string.printable)
    lowered = printable.lower()
    lowered_doc = textacy.Doc(lowered, lang=u"en")
    lowered_str = str(lowered)
    for verb in textacy.extract.get_main_verbs_of_sent(lowered_doc):
        verb_str = str(verb)
        # Offset of the first occurrence of the verb text in the lowered copy.
        offset = lowered_str.index(verb_str)
        head = printable[:offset]
        tail = printable[offset + len(verb):]
        printable = head + verb_str + tail
    final_doc = textacy.Doc(printable, lang=u"en")
    triples = textacy.extract.subject_verb_object_triples(final_doc)
    return next(triples, None)

コード例 #3

0

ファイルを表示

ファイル: util_pt.py プロジェクト: zhongyunuestc/Keyword-Extraction-Bidirectional-LSTM

def kwd(sentence):
    """Extract a de-duplicated list of keywords from a sentence.

    Keywords come from two textacy rankers: SGRank (1- and 2-grams, top 3)
    followed by TextRank. A TextRank term is only added when it is not a
    substring of the comma-joined SGRank terms. Keywords that contain a digit
    or are 3 characters or shorter are dropped.

    Args:
        sentence: text to analyse.

    Returns:
        A list of keyword strings, or an empty list if extraction fails
        (best-effort: any ordinary exception is swallowed).
    """
    try:
        txt_doc = textacy.Doc(sentence, lang="en_core_web_sm")
        kwds_sgrank = textacy.keyterms.sgrank(txt_doc, ngrams=(1, 2), n_keyterms=3)
        kwds_sgrank = [item[0] for item in kwds_sgrank]
        kwds_sgrank_str = ', '.join(kwds_sgrank)
        kwds_textrank = textacy.keyterms.textrank(txt_doc)
        kwds_txtrnk = [item[0] for item in kwds_textrank]

        # Keep only TextRank terms not already covered (as a substring) by
        # the SGRank terms.
        kw_uni = [kw for kw in kwds_txtrnk if kw not in kwds_sgrank_str]

        kwds = kwds_sgrank + kw_uni
        # remove numbers from keywords
        kwds = [term for term in kwds
                if not any(str(ch).isdigit() for ch in term)]
        # removing very small keywords (to reduce noise)
        kwds = [term for term in kwds if len(term) > 3]

    except Exception:
        # Deliberate best-effort fallback. `Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt / SystemExit propagate.
        kwds = []

    return kwds

コード例 #4

0

ファイルを表示

def build_comp_termlist():
    """Build one list of terms per competence directory.

    Walks ``KAG_BASE_PATH/*/*``; every second-level directory is treated as a
    competence. All ``*.txt`` files inside it are read as UTF-8, cleaned with
    ``preprocess_text`` and joined with single spaces into one text, which is
    then parsed with the ``en_core_web_sm`` model.

    Returns:
        A list with one entry per competence directory; each entry is the
        result of ``doc.to_terms_list(named_entities=False, as_strings=True)``.
    """
    import io
    spacy_lang = en_core_web_sm.load()
    texts = []
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        for comp_path in glob.glob(kag_path + '/*'):
            comp_text = ''
            for filename in glob.glob(comp_path + '/*.txt'):
                # io.open decodes to text on both Python 2 and 3, and the
                # context manager closes the handle the original left open.
                with io.open(filename, 'r', encoding='utf-8') as handle:
                    raw_text = handle.read()
                clean_text = preprocess_text(raw_text,
                                             no_urls=True,
                                             no_emails=True,
                                             no_phone_numbers=True,
                                             no_numbers=True,
                                             no_currency_symbols=True,
                                             no_punct=True,
                                             no_contractions=True,
                                             no_accents=True)
                comp_text = ' '.join((comp_text, clean_text))

            doc = textacy.Doc(comp_text, lang=spacy_lang)
            texts.append(
                doc.to_terms_list(named_entities=False, as_strings=True))
    return texts

コード例 #5

0

ファイルを表示

ファイル: pipelines.py プロジェクト: elaisasearch/elaisa.org

    def extractNamedEntitiesAndCreateTextList(self, wordsWithoutStopWordsList):
        """
        Build a word list in which each named entity is a single item.

        The incoming words are joined into one string and scanned for
        (non-numeric) named entities. The result is the whitespace-split
        text with multi-word entities kept together, such as
        [..., 'Angela Merkel', 'said', ...].

        :wordsWithoutStopWordsList: list of word strings
        :returns: list of strings (words and named entities)
        """

        # The caller hands over a list, but entity extraction needs a string.
        joined_text = " ".join(wordsWithoutStopWordsList)

        # TODO: handle all used languages (en, es, de)
        doc = textacy.Doc(joined_text, lang='en')
        found = textacy.extract.named_entities(doc, exclude_types='numeric')
        entity_names = [str(entity) for entity in found]

        # Swap the first remaining occurrence of every entity for a unique
        # 'tmpN' placeholder so that splitting on whitespace cannot break a
        # multi-word entity apart. Only one occurrence is replaced per entity
        # so each placeholder stays unique.
        for number, name in enumerate(entity_names):
            placeholder = 'tmp{}'.format(number)
            joined_text = joined_text.replace(name, placeholder, 1)

        tokens = joined_text.split()

        # Put the original entity text back where its placeholder sits.
        for number, name in enumerate(entity_names):
            slot = tokens.index('tmp{}'.format(number))
            tokens[slot] = name

        return tokens

コード例 #6

0

ファイルを表示

def similaritytoselection(selection,corpus):
    """Rank the docs in `corpus` by word2vec similarity to `selection`.

    `selection` is any text string (e.g. a model answer or one selected
    student response). Returns a list of (similarity, doc_text) tuples
    ordered from most to least similar.
    """
    reference = textacy.Doc(selection)
    scored = []
    for candidate in corpus:
        score = textacy.similarity.word2vec(reference, candidate)
        scored.append((score, candidate.text))
    # Negated key => descending by similarity.
    return sorted(scored, key=lambda pair: -pair[0])

コード例 #7

0

ファイルを表示

ファイル: respaldo.py プロジェクト: edisonchavezsa/sparql-django

def hola(request):
    """Handle the 'hola' page: analyse the posted text and render the page.

    On a POST request the value of the 'text_box' field is parsed with the
    Spanish 'es_core_news_md' model; the number of sentences and, for every
    entity label, the unique entity strings are printed. The template
    'hola.html' is always rendered with an (currently empty) 'datos' list.

    Any request that is not a POST skips the analysis entirely; previously
    this path failed because `text_box_value` was never assigned.
    """
    datos = []

    if request.method != "POST":
        # Nothing was submitted, so there is no text to analyse.
        return render(request, 'hola.html', {'datos': datos})

    text_box_value = request.POST['text_box']
    print(text_box_value)

    nlp = spacy.load("es_core_news_md")
    contenido = nlp(text_box_value)
    docs = textacy.Doc(contenido)
    sentencias = [s for s in docs.sents]
    print(len(sentencias))

    def cleanup(token, lower=True):
        """Strip surrounding whitespace, optionally lower-casing first."""
        if lower:
            token = token.lower()
        return token.strip()

    labels = {w.label_ for w in contenido.ents}
    # Collected for the "PER" label but not passed to the template yet.
    personas = ""
    for label in labels:
        entities = [
            cleanup(e.string, lower=False) for e in contenido.ents
            if label == e.label_
        ]
        entities = list(set(entities))
        if label == "PER":
            personas = entities
        print(label, entities)

    print("Esta son las entidades personas")

    return render(request, 'hola.html', {'datos': datos})

コード例 #8

0

ファイルを表示

    def find_phrases(self, sentence, stop_tokens):
        """Collect noun-chunk n-grams, valid unigrams and verb lemmas.

        Returns a 3-tuple ``(doc_grams, unigrams, verbs_list)``:

        * doc_grams: space-joined 3-grams then 2-grams of each noun chunk's
          lemmatised tokens (stop tokens removed), skipping grams that are
          themselves in `stop_tokens`;
        * unigrams: the noun-chunk tokens accepted by ``self.is_valid_word``;
        * verbs_list: lemmas from verb-phrase matches that are not stop
          tokens and pass ``self.is_valid_word``.
        """
        doc_grams = []
        unigrams = []
        for chunk in nlp(sentence).noun_chunks:
            # Pronouns lemmatise to the "-PRON-" marker; keep their surface
            # text instead.
            lemma_text = " ".join(
                tok.text if tok.lemma_ == "-PRON-" else tok.lemma_
                for tok in chunk)
            tokens = [
                piece for piece in lemma_text.split()
                if piece != "" and piece not in stop_tokens
            ]
            unigrams.extend(
                piece for piece in tokens if self.is_valid_word(piece))
            grams = self.generate_ngrams(tokens, 3)
            grams.extend(self.generate_ngrams(tokens, 2))
            for gram in grams:
                if gram not in stop_tokens:
                    doc_grams.append(space_join(gram))

        verb_doc = textacy.Doc(sentence, lang=model)
        verb_matches = textacy.extract.pos_regex_matches(
            verb_doc, r'<VERB>?<ADV>*<VERB>+')
        verbs_list = []
        for match in verb_matches:
            verbs_list.extend(
                verb for verb in match.lemma_.split()
                if verb not in stop_tokens and self.is_valid_word(verb))
        return doc_grams, unigrams, verbs_list

コード例 #9

0

ファイルを表示

def keywords():
    """Rank keywords of the posted document and return them as one string.

    Reads 'content' and 'title' from the request JSON, scores key terms with
    SGRank, SingleRank and TextRank, and sums the scores with weights 0.9
    (SGRank), 0.05 (TextRank) and 0.05 (SingleRank).

    Returns the terms sorted by descending combined score, formatted as
    '<first half>||<second half>' where the terms inside each half are
    separated by ',,'. Returns '' when there are no terms.
    """
    payload = request.get_json()
    doc = textacy.Doc(payload['content'],
                      metadata={'title': payload['title']},
                      lang=unicode('en_core_web_sm'))

    raw_sgrank = dict(keyterms.sgrank(doc))
    raw_singlerank = dict(keyterms.singlerank(doc))
    raw_textrank = dict(keyterms.textrank(doc))

    weighted_sgrank = {term: score * 0.9
                       for term, score in raw_sgrank.items()}
    weighted_textrank = {term: score * 0.05
                         for term, score in raw_textrank.items()}
    weighted_singlerank = {term: score * 0.05
                           for term, score in raw_singlerank.items()}

    combined = dict(
        Counter(weighted_sgrank) + Counter(weighted_textrank) +
        Counter(weighted_singlerank))
    ranked = sorted(combined.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    if not ranked:
        return ""

    terms = [entry[0] for entry in ranked]
    half = int(len(terms) / 2)
    # '||' separates the top half from the bottom half of the ranking.
    return ",,".join(terms[:half]) + "||" + ",,".join(terms[half:])

コード例 #10

0

ファイルを表示

ファイル: bcnlp_extract.py プロジェクト: timhutch/bitcurator-nlp-entspan

    def __init__(self, infile):
        """Extract the contents of `infile` and wrap them in a textacy Doc.

        The resulting Doc is stored on ``self.doc`` with the file name
        recorded in its metadata under 'filename'.
        """
        extractor = ExtractFileContents()
        print("INIT: Extract contents for infile: ", infile)

        self.doc = textacy.Doc(extractor.extractContents(infile),
                               metadata={'filename': infile})

コード例 #11

0

ファイルを表示

def label_kwd(sentence):
    """Build a tab-separated 'sentence<TAB>keywords' training line.

    Keywords are the SGRank terms (1- and 2-grams) followed by all TextRank
    terms, minus any keyword that contains a digit or is 3 characters or
    shorter; they are joined with commas.

    Args:
        sentence: text to label.

    Returns:
        ``sentence + '\\t' + keywords + '\\n'``, or '' if extraction fails
        (best-effort: any ordinary exception is swallowed). The result is
        also printed, followed by an empty line.
    """
    try:
        txt_doc = textacy.Doc(sentence, lang="en_core_web_sm")
        kwds_sgrank = textacy.keyterms.sgrank(txt_doc, ngrams=(1, 2))
        kwds_sgrank = [item[0] for item in kwds_sgrank]
        kwds_textrank = textacy.keyterms.textrank(txt_doc)
        kwds_txtrnk = [item[0] for item in kwds_textrank]

        # NOTE(review): the original also computed a list of TextRank terms
        # not contained in the SGRank terms but never used it; the output has
        # always been the plain concatenation below, so that is preserved.
        kwds = kwds_sgrank + kwds_txtrnk
        # remove numbers from keywords
        kwds = [
            term for term in kwds
            if not any(str(ch).isdigit() for ch in term)
        ]
        # removing very small keywords (to reduce noise)
        kwds = [term for term in kwds if len(term) > 3]
        final_data = sentence + '\t' + ','.join(kwds) + '\n'

    except Exception:
        # Deliberate best-effort fallback. `Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt / SystemExit propagate.
        final_data = ''

    print(final_data)
    print()
    return final_data

コード例 #12

0

ファイルを表示

def remove_adverbial_clauses(sentence_str):
    """Return `sentence_str` with comma-delimited adverbial clauses removed.

    A clause is the text from one comma token through the next comma token
    (both included) when a token with the 'advcl' dependency occurs between
    them. Any '形' character already in the input is removed too, because it
    is used internally as a placeholder.
    """
    # should also return updated indexes
    # Sam , worried, asked him.
    # [0, 0, 0, 0, 3, 3, 3]
    tdoc = textacy.Doc(sentence_str, lang='en_core_web_lg')

    clause_spans = []  # => [(first_token.i, last_token.i), ...]
    saw_advcl = False
    opening_comma = None
    for token in tdoc:
        if token.tag_ == ',':
            # A comma closes the current segment (if it held an advcl and a
            # comma opened it) and always opens the next one.
            if saw_advcl and opening_comma:
                clause_spans.append((opening_comma.i, token.i))
            opening_comma = token
            saw_advcl = False
        if token.dep_ == 'advcl':
            saw_advcl = True

    filler = '形'
    result = sentence_str
    for first_i, last_i in clause_spans:
        begin = tdoc[first_i].idx
        finish = tdoc[last_i].idx + len(tdoc[last_i].text)
        # Same-length filler keeps later character offsets valid.
        result = result[:begin] + filler * (finish - begin) + result[finish:]
    return result.replace(filler, '')

コード例 #13

0

ファイルを表示

    def phrases(self, clean_text):
        """Return every 2- to 5-gram of the lemmatised text as strings.

        The text is lemmatised (using ``self.stopwords``), the part of each
        lemma before the first '/' is kept, and n-grams are extracted with
        stop words, punctuation and numbers filtered out. Results are
        ordered by n-gram size: all 2-grams first, then 3-, 4- and 5-grams.
        """
        lemmas = lemmatize(clean_text, stopwords=self.stopwords)
        curated_text = ' '.join(str(lemma).split('/')[0] for lemma in lemmas)

        doc = textacy.Doc(curated_text, lang='en')

        collected = []
        for size in (2, 3, 4, 5):
            collected.extend(
                textacy.extract.ngrams(doc,
                                       size,
                                       filter_stops=True,
                                       filter_punct=True,
                                       filter_nums=True))

        return [str(ngram) for ngram in collected]

コード例 #14

0

ファイルを表示

ファイル: sva_rb2.py プロジェクト: empirical-org/Quill-NLP-Tools-and-Datasets

def substitute_infinitives_as_subjects(sent_str):
    """If an infinitive is used as a subject, substitute the gerund.

    An infinitive is a 'to' particle followed by a base-form verb (tag 'VB').
    It counts as a subject when one of its tokens, or the token right after
    it, has the 'csubj' dependency. Each such infinitive is replaced by the
    present participle of its verb.

    Args:
        sent_str: the sentence text.

    Returns:
        The sentence with every qualifying infinitive replaced; all other
        characters (including any literal braces) are left untouched.
    """
    sent_doc = textacy.Doc(sent_str, lang='en_core_web_lg')
    inf_pattern = r'<PART><VERB>'  # To aux/auxpass* csubj
    infinitives = textacy.extract.pos_regex_matches(sent_doc, inf_pattern)
    n_tokens = len(sent_doc)

    # (start_char, end_char, verb_text) for every infinitive to replace,
    # in document order.
    inf_spans = []
    for inf in infinitives:
        if inf[0].text.lower() != 'to':
            continue
        if inf[-1].tag_ != 'VB':
            continue
        # Bounds-check the look-ahead: an infinitive at the very end of the
        # sentence has no following token.
        next_i = inf[-1].i + 1
        followed_by_csubj = (next_i < n_tokens
                             and sent_doc[next_i].dep_ == 'csubj')
        if not followed_by_csubj and all(w.dep_ != 'csubj' for w in inf):
            continue
        start_inf = inf[0].idx
        end_inf = inf[-1].idx + len(inf[-1].text)
        inf_spans.append((start_inf, end_inf, inf[-1].text))

    # Rebuild the sentence piece by piece instead of going through
    # str.format(), which breaks on sentences containing '{' or '}'.
    pieces = []
    cursor = 0
    for start_inf, end_inf, verb_text in inf_spans:
        pieces.append(sent_str[cursor:start_inf])
        pieces.append('{}'.format(
            conjugate(verb_text, tense='presentparticiple')))
        cursor = end_inf
    pieces.append(sent_str[cursor:])
    return ''.join(pieces)

コード例 #15

0

ファイルを表示

def v_phrase_scores(sentence=None, phrase_type='verb'):
    """Sum the sentiment scores of all verb or noun phrases in a sentence.

    `phrase_type` selects the part-of-speech pattern ('verb' or 'noun'); any
    other value prints a message and returns None. Every matching phrase is
    scored with ``v_sentiment_scores`` and the 'pos', 'neg', 'neu' and
    'compound' values are added up.

    Returns a dict with those four keys (all 0 when nothing matches).
    """
    patterns = {
        'verb': r'<VERB>?<ADV>*<VERB>+',
        'noun': r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+',
    }
    if phrase_type not in patterns:
        print('Unregognized phrase type')
        return

    doc = textacy.Doc(sentence, lang='en_core_web_sm')
    matches = textacy.extract.pos_regex_matches(doc, patterns[phrase_type])

    score_keys = ('pos', 'neg', 'neu', 'compound')
    totals = dict.fromkeys(score_keys, 0)
    for match in matches:
        phrase_score = v_sentiment_scores(match.text)
        for key in score_keys:
            totals[key] += phrase_score[key]
    return totals

コード例 #16

0

ファイルを表示

def word_stats(tree):
    """Return textacy TextStats for the tree's "mainText" element.

    Any exception raised while locating the text or computing the stats is
    printed, and None is returned instead.
    """
    try:
        main_text = tree.find("mainText").text
        return textacy.text_stats.TextStats(textacy.Doc(main_text))
    except Exception as err:
        print(err)
    return None

コード例 #17

0

ファイルを表示

 def setUp(self):
     """Prepare the shared fixtures: language, one Doc and a small Corpus.

     Loads the 'en' spaCy pipeline and the CapitolWords dataset, builds
     ``self.doc`` from the first Bernie Sanders text, and builds
     ``self.corpus`` from up to 10 of his records.
     """
     self.spacy_lang = textacy.data.load_spacy('en')
     self.cw = textacy.datasets.CapitolWords()

     sanders_texts = list(
         self.cw.texts(speaker_name={'Bernie Sanders'}, limit=1))
     self.text = sanders_texts[0]
     self.doc = textacy.Doc(self.text.strip(), lang=self.spacy_lang)

     sanders_records = self.cw.records(
         speaker_name={'Bernie Sanders'}, limit=10)
     # Separate each record's 'text' field from the rest of its metadata.
     texts, metadatas = textacy.fileio.split_record_fields(
         sanders_records, 'text')
     self.corpus = textacy.Corpus(
         self.spacy_lang, texts=texts, metadatas=metadatas)

コード例 #18

0

ファイルを表示

ファイル: sva_rb2.py プロジェクト: empirical-org/Quill-NLP-Tools-and-Datasets

def raise_infinitive_error(sentence_str):
    """Check that every 'to' + verb pair uses the base verb form.

    Raises:
        Exception: with message 'InfinitivePhraseError' when a match starts
            with 'to' and its last token is not tagged 'VB'.
    """
    doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    # To aux/auxpass* csubj
    candidates = textacy.extract.pos_regex_matches(doc, r'<PART|ADP><VERB>')
    for candidate in candidates:
        starts_with_to = candidate[0].text.lower() == 'to'
        if starts_with_to and candidate[-1].tag_ != 'VB':
            raise Exception('InfinitivePhraseError')

コード例 #19

0

ファイルを表示

def get_new_doc(phrase, doc_type='textacy'):
    """Parse `phrase` into a textacy Doc or a spaCy doc.

    `doc_type` must be one of `possible_docs`. 'textacy' returns
    ``textacy.Doc(phrase, lang=lang_en)``, 'spacy' returns ``nlp(phrase)``,
    and any other permitted value returns None.
    """
    assert isinstance(phrase, basestring)
    assert isinstance(doc_type, str)
    assert doc_type in possible_docs, "Only {} doc types are supported".format(
        possible_docs)

    if doc_type == 'spacy':
        return nlp(phrase)
    if doc_type == 'textacy':
        return textacy.Doc(phrase, lang=lang_en)
    return None

コード例 #20

0

ファイルを表示

ファイル: KnowledgeGraph.py プロジェクト: Sirsirious/textO

    def wordTuples(graph, textEntry):
        """Return TextRank key terms of the rootified text as small dicts.

        The text produced by ``rootify(graph, textEntry)`` is parsed with the
        'pt' pipeline; TextRank is asked for as many terms as the text has
        unique words. Each result item is a one-entry dict built from the
        first two fields of a ranked term.
        """
        rooted_text = rootify(graph, textEntry)
        portuguese = textacy.load_spacy('pt')
        doc = textacy.Doc(rooted_text, lang=portuguese)
        stats = textacy.TextStats(doc)
        ranked = textacy.keyterms.textrank(
            doc, normalize='lower', n_keyterms=stats.n_unique_words)

        words = []
        for entry in ranked:
            words.append({entry[0]: entry[1]})
        return words

コード例 #21

0

ファイルを表示

def extractNamedEntities(query: str, language: str) -> list:
    """
    Extract the named entities from a query sentence.
    :query: String, the text to scan
    :language: String, passed to textacy as the Doc language
    :returns: List of the entities converted to strings
    """
    doc = textacy.Doc(query, lang=language)
    names = []
    for entity in textacy.extract.named_entities(doc):
        names.append(str(entity))
    return names

コード例 #22

0

ファイルを表示

ファイル: sva_rb2.py プロジェクト: empirical-org/Quill-NLP-Tools-and-Datasets

def split_infinitive_warning(sentence_str):
    """Return 'SplitInfinitiveWarning' if the sentence splits an infinitive.

    A split infinitive is a particle-adverb-verb match that starts with 'to'
    and ends in a base-form verb (tag 'VB'). Returns None otherwise.
    """
    doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    # To aux/auxpass* csubj
    candidates = textacy.extract.pos_regex_matches(doc, r'<PART><ADV><VERB>')
    for candidate in candidates:
        if candidate[0].text.lower() == 'to' and candidate[-1].tag_ == 'VB':
            return 'SplitInfinitiveWarning'
    return None

コード例 #23

0

ファイルを表示

def get_verb_chunks(sent):
    """Return (and print) the text of every verb phrase found in `sent`."""
    doct = textacy.Doc(sent, lang='en_core_web_sm')
    matches = textacy.extract.pos_regex_matches(doct, r'<VERB>?<ADV>*<VERB>+')
    verb_chunks = [match.text for match in matches]

    print(verb_chunks)

    return verb_chunks

コード例 #24

0

ファイルを表示

ファイル: find_subjects.py プロジェクト: brianthomas/adass_subject_recommender

def find_terms(content: str,
               maxterms=MAX_SUGGESTED_KEYTERMS,
               ngrams_to_extract=(1, 2, 3)) -> dict:
    '''
    Find n-grams and key terms of significance in the passed text.

    :param content: the text itself, or an open text stream (e.g. standard
        input) whose whole content is read.
    :param maxterms: number of key terms requested from TextRank.
    :param ngrams_to_extract: n-gram sizes counted in the bag of terms;
        looks for n-grams of up to 3 words by default.
    :returns: a dict with two entries:
        'ngrams'   - list of (term, count) tuples sorted by descending count,
                     with stop words and the empty string removed;
        'keyterms' - the TextRank result for the document.
    '''
    import textacy
    import textacy.keyterms
    import io
    from spacy.lang.en.stop_words import STOP_WORDS

    # If we were handed a text stream (such as standard input), read it;
    # otherwise the content is assumed to be a str already.
    if isinstance(content, io.TextIOBase):
        text = content.read()
    else:
        text = content

    LOG.debug('content has %d chars', len(text))

    doc = textacy.Doc(text)
    LOG.debug(doc)

    # extract keyterms to make suggestions for new ADASS subject terms
    keyterms = textacy.keyterms.textrank(doc,
                                         normalize='lemma',
                                         n_keyterms=maxterms)
    LOG.debug('KEYTERMS: %s', keyterms)

    # We'll use the Bag of terms, ngrams by frequency, to find relevant
    # matches with existing terms in the ADASS dictionary
    bot = doc.to_bag_of_terms(ngrams=ngrams_to_extract,
                              lemmatize=True,
                              named_entities=True,
                              weighting='count',
                              as_strings=True)

    # For some reason we see stopwords in the BoT, so make another pass to
    # clean out stopwords and the empty string.
    cleaned_bot = [(term, cnt) for term, cnt in bot.items()
                   if term not in STOP_WORDS and term != '']
    sorted_cleaned_bot = sorted(cleaned_bot, key=lambda x: x[1], reverse=True)
    LOG.debug('BAG of Terms (top, cleaned): %s',
              sorted_cleaned_bot[:MAX_BOT_TERMS])

    return {'ngrams': sorted_cleaned_bot, 'keyterms': keyterms}

コード例 #25

0

ファイルを表示

    def extract_keyphrases(self, algorithm, **kwargs):
        """Extract keyphrases from ``self.text`` into ``self.keyphrases``.

        Args:
            algorithm: either a callable taking ``(doc, **kwargs)``, or the
                name of one as a string; a string is looked up as an
                attribute of ``tkt`` (dotted names are followed attribute by
                attribute).
            **kwargs: parameters forwarded to the algorithm.

        Raises:
            AttributeError: if a string `algorithm` does not name an
                attribute of ``tkt``.
        """

        if isinstance(algorithm, str):
            # Plain attribute lookup replaces the former
            # eval('tkt.<name>'): it resolves the same names without
            # executing arbitrary expressions passed in by the caller.
            resolved = tkt
            for attr in algorithm.split('.'):
                resolved = getattr(resolved, attr)
            algorithm = resolved

        doc = textacy.Doc(self.text, lang='en')
        self.keyphrases = list(algorithm(doc, **kwargs))

コード例 #26

0

ファイルを表示

ファイル: sva_rb2.py プロジェクト: empirical-org/Quill-NLP-Tools-and-Datasets

def drop_modifiers(sentence_str):
    """Return the sentence text with all modifier tokens removed.

    A token is dropped when its dependency label ends in 'mod'. Whitespace
    is normalised afterwards. Any '形' character already in the text is
    removed too, because it is used internally as a placeholder.
    """
    tdoc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    filler = '形'
    masked = tdoc.text
    for token in tdoc:
        if not token.dep_.endswith('mod'):
            continue
        # Overwrite the token with same-length filler so the character
        # offsets of the remaining tokens stay valid.
        token_end = token.idx + len(token.text)
        masked = masked[:token.idx] + filler * len(token.text) + \
            masked[token_end:]
    stripped = masked.replace(filler, '')
    return textacy.preprocess.normalize_whitespace(stripped)

コード例 #27

0

ファイルを表示

ファイル: sva_rb2.py プロジェクト: empirical-org/Quill-NLP-Tools-and-Datasets

def remove_prepositional_phrases(sentence_str):
    """Return `sentence_str` with its prepositional phrases cut out.

    Any '形' character already in the input is removed too, because it is
    used internally as a placeholder.
    """
    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    # possessive pronouns are labelled as ADJ
    pp_pattern = r'<ADP><ADJ|DET>?(<NOUN>+<ADP>)*<NOUN>+'
    filler = '形'
    masked = sentence_str
    for phrase in textacy.extract.pos_regex_matches(sentence_doc, pp_pattern):
        first_char = phrase[0].idx
        width = len(phrase.text)
        # Same-length filler keeps the offsets of later phrases valid.
        masked = masked[:first_char] + filler * width + \
            masked[first_char + width:]
    return masked.replace(filler, '')

コード例 #28

0

ファイルを表示

ファイル: get_live_hashtag_sentiment.py プロジェクト: tyleratk/twitter-sentiment

    def show_plot(self, pos, neg, plot_type='semantic'):
        """Plot topics for the positive and the negative tweet groups.

        Args:
            pos: the positive tweets.
            neg: the negative tweets.
            plot_type: 'semantic' draws a semantic word network per group;
                any other value draws a termite plot of a 3-topic NMF model
                fitted on TF-IDF features.

        Each tweet is passed through ``self.prep_tokens`` first. The figures
        are shown without blocking.

        NOTE(review): the semantic branch reads ``group.shape[0]``, so the
        groups presumably are pandas/NumPy objects - confirm with the caller.
        """
        for name, group in [('Positive', pos), ('Negative', neg)]:
            if plot_type == 'semantic':
                corpus = [self.prep_tokens(tweet) for tweet in group]
                corpus = ' '.join(corpus)
                cleaned_text = textacy.preprocess_text(corpus,
                                                       fix_unicode=True,
                                                       no_accents=True)
                doc = textacy.Doc(cleaned_text, lang='en')
                graph = doc.to_semantic_network(nodes='words',
                                                edge_weighting='cooc_freq',
                                                window_width=10)
                drop_nodes = ['pron']
                for node in drop_nodes:
                    # Remove the node only if it exists, instead of hiding
                    # every possible error behind a bare `except: pass`.
                    if node in graph:
                        graph.remove_node(node)
                node_weights = nx.pagerank_scipy(graph)
                textacy.viz.network.draw_semantic_network(
                    graph, node_weights=node_weights, spread=50.0)
                plt.suptitle(name + ' Sentiment Topics:' +
                             '\n{} {} tweets\n{}'.format(
                                 group.shape[0], name, self.hashtag))
            else:
                corpus = [self.prep_tokens(tweet) for tweet in group]
                tf = TfidfVectorizer().fit(corpus)

                doc_term_matrix = tf.transform(corpus)
                vocab = tf.get_feature_names()
                # NOTE(review): dropping 'pron' here shortens `vocab` while
                # `doc_term_matrix` keeps that column, so terms after it no
                # longer line up with their column index - confirm this is
                # acceptable for termite_plot.
                vocab = [word for word in vocab if word != 'pron']

                model = textacy.tm.TopicModel('nmf', n_topics=3)
                model.fit(doc_term_matrix)
                model.termite_plot(doc_term_matrix,
                                   vocab,
                                   topics=-1,
                                   n_terms=25,
                                   sort_terms_by='seriation',
                                   rank_terms_by='topic_weight',
                                   highlight_topics=range(3))
                plt.suptitle(name + ' Sentiment Topics:')

        plt.show(block=False)

コード例 #29

0

ファイルを表示

def generate_corpus(comment_json):
    """Load article comments from a JSON file as a list of textacy Docs.

    Only the first article in the file is used. Every comment becomes one
    Doc whose metadata carries the comment's 'comment_author' and
    'comment_time'.
    """
    with open(comment_json) as input_file:
        article_comments = json.load(input_file)

    first_article = list(article_comments.values())[0]

    corpus = []
    for cmt in first_article['comments']:
        metadata = {
            field: cmt[field]
            for field in ('comment_author', 'comment_time')
        }
        corpus.append(textacy.Doc(cmt['comment'], metadata=metadata))
    return corpus

コード例 #30

0

ファイルを表示

def remove_prepositional_phrases(sentence_str):
    """Given a string, return a new string without its prepositional phrases.

    Any '形' character already in the input is removed as well, because it
    is used internally as a placeholder.
    """
    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    matches = textacy.extract.pos_regex_matches(
        sentence_doc, r'<ADP><ADJ|DET>?(<NOUN>+<ADP>)*<NOUN>+')
    # (start offset, length in characters) of every phrase to cut.
    spans = [(pp[0].idx, len(pp.text)) for pp in matches]

    placeholder = '形'
    result = sentence_str
    for offset, width in spans:
        # Mask with same-length filler so later offsets remain correct.
        result = (result[:offset] + placeholder * width +
                  result[offset + width:])
    return result.replace(placeholder, '')