Example 1
    def pos_tag(self,text):

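        # keep the pre-tokenized input for taggers that accept token lists; join it into a plain string for pipelines that tokenize internally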
        token_text = text
        text = ' '.join(text)

        if self.lang=='en':
            return nltk.pos_tag(token_text)

        elif self.lang in ['ro']:

            tags = self.tagger.predict([features(token_text, index) for index in range(len(token_text))])
            return self.change_ud2penn([(token_text[i], tags[i]) for i in range(len(token_text))])
 
        elif self.lang=='ar':

            pipeline = stanfordnlp.Pipeline(models_dir="models/stanford", lang='ar', use_gpu=False, processors='tokenize,pos')
            doc = pipeline(text)
            tags = [(word.text, word.upos) for sent in doc.sentences for word in sent.words]         
            
            return tags
 
        elif self.lang=='es':
            # blob =  Text(text, hint_language_code=self.lang)
            # return self.change_ud2penn(blob.pos_tags)
            pipeline = stanfordnlp.Pipeline(models_dir="models/stanford", lang='es', use_gpu=False, processors='tokenize,pos')
            doc = pipeline(text)
            tags = [(word.text, word.upos) for sent in doc.sentences for word in sent.words]         
            return tags
       
        elif self.lang =='fr':
            return nltk.pos_tag(token_text)
Example 2
def preprocess_text(samples,
                    lowercase=False,
                    lemmatize=False,
                    remove_diacritics=False):

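    # build the Slovak ('sk') pipeline once up front; include the pos and lemma processors only when lemmas are requested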
    if lemmatize:
        pipeline = stanfordnlp.Pipeline(processors='tokenize,pos,lemma',
                                        lang='sk')
    else:
        pipeline = stanfordnlp.Pipeline(processors='tokenize', lang='sk')

    for i, sample in enumerate(samples):
        doc = pipeline(sample.text)

        if lemmatize:
            tokens = [
                word.lemma for sentence in doc.sentences
                for word in sentence.words
            ]
        else:
            tokens = [
                word.text for sentence in doc.sentences
                for word in sentence.words
            ]

        if lowercase:
            tokens = [token.lower() for token in tokens]
        if remove_diacritics:
            tokens = [unidecode.unidecode(token) for token in tokens]
        samples[i].text = tokens

    return samples
Example 3
 def __init__(self, nlp=None, lang="tr"):
     if nlp:
         self.nlp = nlp
     else:
         try:
             self.nlp = stanfordnlp.Pipeline(lang=lang)
         except:
             stanfordnlp.download(lang)
             self.nlp = stanfordnlp.Pipeline(lang=lang)
Example 4
 def __init__(self,
              lang: str,
              annotators: Union[str, List[str]],
              use_corenlp=False,
              models_dir="",
              pretokenized=False,
              merge_one_sent=False,
              sep_syms=""):
     self.lang = lang
     self.corenlp_language_name = LANG_NAME_MAP[lang]
     self.use_corenlp = use_corenlp
     self.models_dir = models_dir
     if isinstance(annotators, str):
         self.annotators = annotators.split(",")
     else:
         self.annotators = annotators
     self.annotators_set = set(self.annotators)
     self.pretokenized = pretokenized
     self.merge_one_sent = merge_one_sent
     self.need_pos, self.need_lemma, self.need_dep = [
         (z in self.annotators_set) for z in ['pos', 'lemma', 'depparse']
     ]
     self.sep_syms_set = set(list(
         sep_syms))  # separate these symbols (add spaces) apart especially
     if len(self.sep_syms_set) > 0:
         assert not self.pretokenized, "No processing for sep_syms if pretokenized!!"
     # -----
     # currently only support certain operations
     if use_corenlp:
         assert all(
             (z in {'tokenize', 'ssplit', 'pos', 'lemma', 'depparse'})
             for z in self.annotators)
         assert "CORENLP_HOME" in os.environ, "CORENLP_HOME not found, please set this for CORENLP!!"
         zlog(
             f"Start tokenizer with corenlp, home={os.environ['CORENLP_HOME']} and self = {self.__dict__}"
         )
         # self.corenlp = None  # binded at running time
         self.nlp = None
     else:  # no mwt since that may split words (which we ignore for now)
         # assert all((z in {'tokenize','mwt','pos','lemma','depparse'}) for z in self.annotators)
         assert all((z in {'tokenize', 'pos', 'lemma', 'depparse'})
                    for z in self.annotators)
         zlog(f"Start tokenizer with stanfordnlp, self = {self.__dict__}")
         # self.corenlp = None
         annotators_str = ",".join(self.annotators)
         if len(models_dir) > 0:
             self.nlp = stanfordnlp.Pipeline(
                 processors=annotators_str,
                 lang=self.lang,
                 tokenize_pretokenized=pretokenized,
                 models_dir=models_dir)
         else:  # use default dir
             self.nlp = stanfordnlp.Pipeline(
                 processors=annotators_str,
                 lang=self.lang,
                 tokenize_pretokenized=pretokenized)
Example 5
 def __init__(self, lang="en", models_dir=""):
     # mostly using stanfordnlp
     self.lang = lang
     if len(models_dir) > 0:
         self.parser = stanfordnlp.Pipeline(
             processors='tokenize,mwt,pos,lemma,depparse',
             lang=lang,
             tokenize_pretokenized=True,
             models_dir=models_dir)
     else:
         self.parser = stanfordnlp.Pipeline(
             processors='tokenize,mwt,pos,lemma,depparse',
             lang=lang,
             tokenize_pretokenized=True)
Example 6
def tag_relations(text, terms, bags, nlp=None):
    """ Modified version of tag relations that handles the special case of making predictions
        on new data without known relation labels.
    """

    # default to Stanford NLP pipeline wrapped in Spacy
    if nlp is None:
        snlp = stanfordnlp.Pipeline(lang="en")
        nlp = StanfordNLPLanguage(snlp)

    # preprocess with spacy if needed
    if type(terms[0]) != spacy.tokens.doc.Doc:
        terms = [nlp(term) for term in terms]
    if (type(text) != spacy.tokens.doc.Doc
            and type(text) != spacy.tokens.span.Span):
        text = nlp(text)

    results = tag_terms(text, terms, nlp)
    tokenized_text = results["tokenized_text"]
    tagged_text = results["tags"]
    found_terms_info = results["found_terms"]

    found_terms = list(found_terms_info.keys())
    for i in range(len(found_terms) - 1):
        for j in range(i + 1, len(found_terms)):
            term_pair = (found_terms[i], found_terms[j])
            bags = add_relation(term_pair, found_terms_info, tokenized_text,
                                bags)
            term_pair_reverse = (found_terms[j], found_terms[i])
            bags = add_relation(term_pair_reverse, found_terms_info,
                                tokenized_text, bags)

    return bags
Example 7
def tokenize(fpath, lang):

    content = open(fpath).read()
    paragraphs = re.split(r'\n+', content)
    res_sents = []
    res_pars = []
    res_pos = []
    start_par = 0
    nlp = stanfordnlp.Pipeline(lang=lang, processors="tokenize,mwt,pos")
    for par in paragraphs:
        par = par.strip()
        if not par:
            continue
        doc = stanfordnlp.Document(par)
        doc = nlp(doc)
        #print(doc.conll_file.conll_as_string())
        #print(doc.conll_file.sents)
        sents = [[token[1] for token in sent if '-' not in token[0]]
                 for sent in doc.conll_file.sents]
        pos = [[token[3] for token in sent if '-' not in token[0]]
               for sent in doc.conll_file.sents]
        res_sents.extend(sents)
        res_pos.extend(pos)
        length = sum((len(s) for s in sents))
        res_pars.append([start_par, start_par + length - 1])
        start_par = start_par + length
    return res_sents, res_pos, res_pars
Example 8
def load_stanfordnlp_model(path, lang='en'):
    nlp = stanfordnlp.Pipeline(lang=lang,
                               use_gpu=False,
                               processors='tokenize,mwt,pos,lemma,depparse',
                               pos_batch_size=1000,
                               models_dir=path)
    return nlp
Example 9
    def __clean_text(self, df):
        config = {
            'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
            'lang': 'ru',  # Language code for the language to build the Pipeline in
            'tokenize_model_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_tokenizer.pt',
            'pos_model_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_tagger.pt',
            'pos_pretrain_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus.pretrain.pt',
            'lemma_model_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_lemmatizer.pt',
            'depparse_model_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_parser.pt',
            'depparse_pretrain_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus.pretrain.pt'
        }

        snlp = stanfordnlp.Pipeline(**config)
        nlp = StanfordNLPLanguage(snlp)

        text_list = df["Text"].values
        lower_text_list = []
        for text in text_list:
            text_lower = text.lower()
            lower_text_list.append(text_lower)
        clean_text_list = []
        for text in lower_text_list:
            text = nlp(text)
            token = [token.lemma_ for token in text if not (token.is_punct or token.is_stop)]
            clean_text_list.append(token)

        return clean_text_list
Example 10
def truecase(text):
    # Tokenize the text by sentences
    sentences = sent_tokenize(text, language='english')
    # Capitalize each sentence
    sentences_capitalized = [s.capitalize() for s in sentences]
    # Join the sentences back
    capitalized_text = re.sub(" (?=[\.,'!?:;])", "",
                              ' '.join(sentences_capitalized))
    # Create a stanfordnlp pipeline by the following processors
    stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')
    # Process the text
    doc = stf_nlp(capitalized_text)
    # Capitalize the words if the word is the following parts of speech
    lst = [
        w.text.capitalize() if w.upos in ["PROPN", "NNS"] else w.text
        for sent in doc.sentences for w in sent.words
    ]
    # Join the list of words
    pos_capitalized = ' '.join(lst)
    # Replace i, i'll, i'm, i've with the capitalized variants
    for pat, repl in replacements:
        pos_capitalized = re.sub(pat, repl, pos_capitalized)
    # Remove the spaces between the punctuation
    result = re.sub(r'\s+([?.!"\'])', r'\1', pos_capitalized)
    return result
Example 11
def test_pretokenized():
    nlp = stanfordnlp.Pipeline(**{'processors': 'tokenize', 'models_dir': '.', 'lang': 'en',
                                  'tokenize_pretokenized': True})
    doc = nlp(EN_DOC_PRETOKENIZED)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    doc = nlp(EN_DOC_PRETOKENIZED_LIST)
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
Example 12
def test_spacy_stanfordnlp(lang, models_dir):
    snlp = stanfordnlp.Pipeline(lang=lang, models_dir=models_dir)
    nlp = StanfordNLPLanguage(snlp)
    assert nlp.lang == "stanfordnlp_" + lang

    doc = nlp("Hello world! This is a test.")

    # fmt: off
    assert [t.text for t in doc
            ] == ["Hello", "world", "!", "This", "is", "a", "test", "."]
    assert [t.lemma_ for t in doc
            ] == ["hello", "world", "!", "this", "be", "a", "test", "."]
    assert [t.pos_ for t in doc] == [
        "INTJ", "NOUN", "PUNCT", "DET", "VERB", "DET", "NOUN", "PUNCT"
    ]
    assert [t.tag_
            for t in doc] == ["UH", "NN", ".", "DT", "VBZ", "DT", "NN", '.']
    assert [t.dep_ for t in doc] == [
        "root", "vocative", "punct", "nsubj", "cop", "det", "root", "punct"
    ]
    assert [t.is_sent_start
            for t in doc] == [True, None, None, True, None, None, None, None]
    assert any([t.is_stop for t in doc])
    # fmt: on
    assert len(list(doc.sents)) == 2
    assert doc.is_tagged
    assert doc.is_parsed
    assert doc.is_sentenced

    docs = list(nlp.pipe(["Hello world", "This is a test"]))
    assert docs[0].text == "Hello world"
    assert [t.pos_ for t in docs[0]] == ["INTJ", "NOUN"]
    assert docs[1].text == "This is a test"
    assert [t.pos_ for t in docs[1]] == ["DET", "VERB", "DET", "NOUN"]
Example 13
def _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp):
    if model_or_lang is None:
        model_or_lang = 'en' if use_stanfordnlp else 'en_core_web_sm'

    nlp = None
    if use_stanfordnlp:
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang,
                                    tokenize_pretokenized=is_tokenized)
        nlp = StanfordNLPLanguage(snlp)
    else:
        # Initialize the spaCy model with a custom pipe,
        # taking into account 'is_tokenized', 'disable_sbd', and 'include_headers'
        nlp = spacy.load(model_or_lang)
        if is_tokenized:
            nlp.tokenizer = nlp.tokenizer.tokens_from_list
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser')

    conllformatter = ConllFormatter(nlp)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
Example 14
def conll_parse(sentences, file_path):

    config = {
        'processors': 'tokenize,lemma,pos,depparse',
        'tokenize_pretokenized': True
    }

    stanford_parser = stanfordnlp.Pipeline(**config)
    n_steps = len(sentences)

    import time
    start_time = time.time()

    with open(str(file_path), 'w') as f:
        for i, sentence in enumerate(list(sentences)):
            doc_obj = stanford_parser(sentence)

            f.write(doc_obj.conll_file.conll_as_string())

            if i % 100 == 0:
                completion = (i + 1) / n_steps
                comp_time = time.time() - start_time
                print(
                    f'{completion*100:.1f}% complete. Elapsed time: {comp_time:.2f}s'
                )

    print(f'Finished. Total computation time: {time.time() - start_time}')
Example 15
def demo_nlp():
    global start_state
    print("StanfordNLP")
    print("Please stand by while we initialize the StanfordNLP pipeline...")
    global lang_model_dir
    pipeline = stanfordnlp.Pipeline(models_dir=lang_model_dir, lang='en', use_gpu=False)
    print()
    print("Enter sentence(s) to parse, or just press Return to exit.")
    while True:
        inp = input("Input: ")
        if inp == "": return
        if inp == "PLOT":
            plot.plot.present(start_state)
            continue

        inp = preprocess(inp)
        doc = pipeline(inp)

        goal = Requirements(start_state)
        for i in range(0, len(doc.sentences)):
            cmd = patterns.handle_input(doc.sentences[i])
            get_requirement(cmd, goal)

        take_actions(goal)

        if recent_objspec:
            print("recent_objspec:", recent_objspec)
Example 16
 def __init__(self):
     self.nlp = stanfordnlp.Pipeline(lang="id",use_gpu=False, silent=True)
     self.stemmer = StemmerFactory().create_stemmer()
     self.ner = get_entities
     # Set POS Tagger 
     self.pos_tagger = nltk.tag.CRFTagger()
     self.pos_tagger.set_model_file('pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')
Example 17
 def __init__(self):
     self.tools = []
     self._n = 0
     self.pk_seg = pkuseg.pkuseg()
     self.thu_seg = thulac.thulac(seg_only=True)
     self.stanford_pipeline = stanfordnlp.Pipeline(processors='tokenize',
                                                   lang='zh')
Example 18
def pos_tagger(bundled_data, threshold):

    # combine list of sentences
    reviews = []
    for review in bundled_data:
        reviews.extend(review)

    # parser
    parser = stanfordnlp.Pipeline(processors="tokenize,mwt,lemma,pos")

    # extract sentences from reviews
    sentences_list = []
    for review in reviews:
        doc = parser(review)
        sentences_list.extend([sentence for sentence in doc.sentences])

    sampled_sentences = random.sample(sentences_list, threshold)

    parsed_texts = []
    for sentence in sampled_sentences:
        parsed_text = {}
        for i in range(len(sentence.words)):
            parsed_text[i] = (sentence.words[i].text, sentence.words[i].xpos)
        parsed_texts.append(parsed_text)

    with open("out/d_pos_tagged.json", "w") as json_file:
        json.dump(parsed_texts, json_file, indent=4, sort_keys=True)
Example 19
def mrr_pairs(generated_captions_fn, occurrences_dir, heldout_pairs, split): # TODO
    with open(generated_captions_fn) as f:
        generated_captions = json.load(f)
    id2captions = {meta['image_id']: meta['captions'] for meta in generated_captions}

    config = {'use_gpu': False, 'tokenize_pretokenized': True}
    nlp_pipeline = stanfordnlp.Pipeline(lang='en', models_dir=STANFORDNLP_DIR, **config)

    mrr_scores = {}
    for pair in heldout_pairs:
        occurrences_fn = os.path.join(occurrences_dir, pair + ".json")
        occurrences_data = json.load(open(occurrences_fn, "r"))

        _, val_indices, test_indices = get_occurrences_splits([occurrences_fn])
        if split == "val2014":
            eval_indices = test_indices
        else:
            eval_indices = val_indices

        nouns = set(occurrences_data[NOUNS])
        if ADJECTIVES in occurrences_data:
            others = set(occurrences_data[ADJECTIVES])
            concept_type = "adj-noun"
        elif VERBS in occurrences_data:
            others = set(occurrences_data[VERBS])
            concept_type = "verb-noun"
        else:
            raise ValueError("No adjectives or verbs found in occurrences data!")
        mrr_score = calc_mrr(id2captions, eval_indices, nouns, others, concept_type, occurrences_data, nlp_pipeline)

        pair = os.path.basename(occurrences_fn).split(".")[0]
        mrr_scores[pair] = mrr_score
    return mrr_scores
Example 20
def making_parsed_tree(sentiment_code, file_name):
    splited_sentence_first = []
    parsed_sentence_first = []

    pcn = StanfordCoreNLP('http://*****:*****@", '', text)
        text = re.sub(r'http\S+', '', text)
        return text

    for a in tqdm(range(len(df_amazon))):
        tweet_txt = about_symbol(text[a])
        if label[a] == sentiment_code:
            if len(tweet_txt) > 3:
                tweet_txt = " ".join(tweet_txt.split())
                tweet_txt = contractions.fix(tweet_txt)

                doc = nlp(tweet_txt)
                splited_sentence_second = []
                parsed_sentence_second = []

                for sentence in doc.sentences:
                    temp = []
                    for token in sentence.tokens:
                        temp.append(token.text)
                    sum_text = " ".join(temp)
                    sum_text = about_symbol(sum_text)
                    output = pcn.annotate(sum_text,
                                          properties={
                                              'annotators': 'parse',
                                              'outputFormat': 'json'
                                          })
                    parsed_sent = output['sentences'][0]['parse']
                    parsed_sent = " ".join(parsed_sent.split())
                    parsed_sent = parsed_sent.replace('(', '<')
                    parsed_sent = parsed_sent.replace(')', '>')

                    parsed_sentence_second.append(parsed_sent)
                    splited_sentence_second.append(sum_text)
                    # print(parsed_sent)
                splited_sentence_first.append(splited_sentence_second)
                parsed_sentence_first.append(parsed_sentence_second)

            sent_json['splited_sentence'] = []
            sent_json['parsed_sentence'] = []
            sent_json['original_sentence'] = []
            sent_json['splited_sentence'].append(splited_sentence_first)
            sent_json['parsed_sentence'].append(parsed_sentence_first)
            sent_json['original_sentence'].append(tweet_txt)

    with open(file_name, 'w') as out_file:
        json.dump(sent_json, out_file, indent=4)
Example 21
def tokenizer(gene_texts, MAX_NB_WORDS):
    word_index = {}
    docs = []
    txt_count = 0
    index = 1
    global nlp  # reuse a previously built pipeline; assumes a module-level `nlp` (set to None until first use)
    if nlp is None:
        nlp = stanfordnlp.Pipeline(use_gpu=False)
    for text in gene_texts:
        txt_count += 1
        if txt_count % 100 == 0:
            print('[tokenized txt]:', txt_count)
        doc = nlp(text)
        docs.append(doc)
        txt_matrix = NLP.get_text_matrix(doc)  # doc matrix (array)
        # == process this text matrix
        for i in range(len(txt_matrix)):
            sent_arr = txt_matrix[i]
            for j in range(len(sent_arr)):
                token = sent_arr[j].lower()
                # add to word_index
                if len(word_index) < MAX_NB_WORDS:
                    if token in word_index.keys():
                        continue
                    else:
                        word_index[token] = index
                        index += 1
    return word_index, docs
Example 22
def get_org_name(url):

    response = requests.get(url)
    if (response.status_code == 200):
        soup = BeautifulSoup(response.text, "html.parser")
        doc_text = soup.find_all(
            ['p', 'tr'])  # fixed code to extract text from tables as well.
        doc_text = ''.join([x.text for x in doc_text])
    doc = clean_text(doc_text, lower=False)

    nlp = spacy.load('en_core_web_sm')
    stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')
    doc = doc[:250].split()
    doc = ' '.join([x for x in doc if x.lower().find('newswire') == -1])

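    # collect organization candidates from both spaCy NER ('ORG' entities) and StanfordNLP proper-noun tags (NNP/PROPN)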
    orgs = set()
    spacy_doc = ' '.join(doc.split()).strip()
    nlp_doc = nlp(spacy_doc)
    for ent in nlp_doc.ents:
        if (ent.label_ == 'ORG'):
            orgs.add(ent.text)

    sf_doc = stf_nlp(doc)

    for sent in sf_doc.sentences:
        for word in sent.words:
            if (word.xpos == 'NNP' and word.upos == 'PROPN'):
                orgs.add(word.text)

    return ', '.join(list(orgs))
Example 23
 def __init__(self) -> None:
     snlp = stanfordnlp.Pipeline(lang='en')  # stanfordnlp python pipeline
     self.nlp = StanfordNLPLanguage(snlp)  # spaCy wrapper for snlp
     conllformatter = ConllFormatter(self.nlp)
     self.nlp.add_pipe(conllformatter, last=True)
     self.detokenizer = MosesDetokenizer()
     self.vanila_preprocessor = PreprocessorBase()
Example 24
def lemmatize_texts(lemmatizer):
    entries = Entry.objects.filter(lemmatized='')

    if lemmatizer == 'stanford':
        texts = [(entry.text, entry.id) for entry in entries]
        snlp = stanfordnlp.Pipeline(lang='ru')
        nlp = StanfordNLPLanguage(snlp)
        for doc in tqdm.tqdm(
                nlp.pipe(texts,
                         batch_size=100,
                         as_tuples=True,
                         disable=["tagger", "parser", "pos", "depparse"])):
            id = doc[1]
            lemmatized = ' '.join([token.lemma_ for token in doc[0]])
            entry = Entry.objects.get(id=id)
            entry.lemmatized = lemmatized
            entry.save()
    if lemmatizer == 'mystem':
        m = Mystem()
        for entry in tqdm.tqdm(entries):
            lemmas = m.lemmatize(entry.text)
            lemmatized = ''.join(lemmas)
            entry = Entry.objects.get(id=entry.id)
            entry.lemmatized = lemmatized
            entry.save()
Example 25
def parseToconllu(data):
    data = data[:5]
    nlp = stanfordnlp.Pipeline(
        models_dir='E:/anaconda/setup/Lib/site-packages/stanfordnlp'
    )  # This sets up a default neural pipeline in English
    doc_text = ''
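    # normalize every question/answer: ensure terminal punctuation and an uppercase first letter, then append it to one running text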
    for data_ in data:
        for i in range(len(data_['questions'])):
            if not data_['questions'][i]['input_text'][-1] == '?':
                data_['questions'][i]['input_text'] += '?'
            if not data_['questions'][i]['input_text'][0].isupper():
                data_['questions'][i]['input_text'] = list(
                    data_['questions'][i]['input_text'])
                data_['questions'][i]['input_text'][0] = data_['questions'][i][
                    'input_text'][0].upper()
                data_['questions'][i]['input_text'] = ''.join(
                    data_['questions'][i]['input_text'])
            doc_text += data_['questions'][i]['input_text'] + ' '
            if not data_['answers'][i]['input_text'][-1] == '.':
                data_['answers'][i]['input_text'] += '.'
            if not data_['answers'][i]['input_text'][0].isupper():
                data_['answers'][i]['input_text'] = list(
                    data_['answers'][i]['input_text'])
                data_['answers'][i]['input_text'][0] = data_['answers'][i][
                    'input_text'][0].upper()
                data_['answers'][i]['input_text'] = ''.join(
                    data_['answers'][i]['input_text'])
            doc_text += data_['answers'][i]['input_text'] + ' '


#        doc_text += '\n'
    print('start parsing...')
    doc = nlp(doc_text)
    doc.write_conll_to_file('example.conllu')
Example 26
 def __init__(self):
     file = open("assign4_reviews.txt", "r")
     result = []
     input_text = file.readlines()
     # stanfordnlp.download('en')
     nlp = stanfordnlp.Pipeline()
     for i in range(0, len(input_text) - 1):
         doc = nlp(input_text[i])
         midresult = doc.conll_file.conll_as_string()
         midresult = midresult.split('\n')
         last_PROPN = ''
         last_Adj = ''
         for x in midresult:
             x = x.split('\t')
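             # CoNLL-U fields after the split: x[1] is the word form, x[3] is the UPOS tag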
             if (len(x) > 4 and x[3] != 'PUNCT'):
                 if (x[3] == 'PROPN' or x[3] == 'NOUN'):
                     last_PROPN = last_PROPN + " " + x[1]
                     splitted = last_PROPN.split(" ")
                     if len(splitted) > 2:
                         splitted = splitted[1:]
                     last_PROPN = " ".join(splitted)
                 if (x[3] == 'ADJ'):
                     if (last_PROPN != ' '):
                         result.append((last_PROPN, x[1]))
                         last_PROPN = ''
     self.result = result
     return
Example 27
def preprocess(models_dir, processors, extractive, cnn_dm_file, article_file,
               summary_file, output_train_files, output_test_files,
               split_ratio):
    """
    This function goes through the entire preprocessing pipeline resulting in files ready to be used as the input of
    the graph neural network.
    :param models_dir: The path to the stanfordnlp directory.
    :param processors: List of processors to use. Options: tokenize, mwt, pos, lemma, depparse.
    :param extractive: Whether to make an extractive summary or not. If true, the input file should contain the best ids.
    :param cnn_dm_file: This file should contain the articles and the summaries in a jsonl format.
    :param article_file: This file will contain every article graph.
    :param summary_file: This file will contain every summary graph.
    :param output_train_files: The paths to save the training files.
                               The first parameter is the training input, the second is the expected output.
    :param output_test_files: The paths to save the validation files.
                              The first parameter is the validation input, the second is the expected output.
    :param split_ratio: The ratio of data used for training vs validation.
    :return: None
    """
    from graph_transformations.preprocessor import main as stanford_preprocess
    if extractive:
        from graph_transformations.cnn_extractive_parser import main as cnn_process
    else:
        from graph_transformations.cnn_parser import main as cnn_process
    from graph_transformations.train_test_split import train_test_split
    import stanfordnlp

    if not os.path.exists(models_dir):
        stanfordnlp.download('en', resource_dir=models_dir)
    correct_processors = ["tokenize", "mwt", "pos", "lemma", "depparse"]

    incorrect = [i for i in processors if i not in correct_processors]
    if len(incorrect) != 0:
        raise ValueError(
            "The following processor values are incorrect: {}".format(
                incorrect))

    if not os.path.exists(cnn_dm_file):
        raise FileNotFoundError(
            "The input file is not found. {} not found".format(cnn_dm_file))

    pipeline = stanfordnlp.Pipeline(models_dir=models_dir,
                                    processors=processors)
    processed_file = "{}_processed.jsonl".format(
        os.path.splitext(cnn_dm_file)[0])
    stanford_preprocess(pipeline, cnn_dm_file, processed_file)

    dependency_file = "dep.jsonl"
    word_file = "words.jsonl"
    pos_file = "pos.jsonl"
    cnn_process(processed_file, article_file, summary_file, dependency_file,
                word_file, pos_file, dependency_file[:-1], word_file[:-1],
                pos_file[:-1])

    ratio = int(split_ratio) if split_ratio >= 1.0 else int(split_ratio * 100)
    train_test_split(article_file, output_train_files[0], output_test_files[0],
                     ratio)
    train_test_split(summary_file, output_train_files[1], output_test_files[1],
                     ratio)
Example 28
def test_missing_requirements():
    """
    Try to build several pipelines with bad configs and check thrown exceptions against gold exceptions.
    :return: None
    """
    # list of (bad configs, list of gold ProcessorRequirementsExceptions that should be thrown) pairs
    bad_config_lists = [
        # missing tokenize
        (
            # input config
            {
                'processors': 'pos,depparse',
                'models_dir': TEST_MODELS_DIR,
                'lang': 'en'
            },
            # 2 expected exceptions
            [{
                'processor_type': 'POSProcessor',
                'processors_list': ['pos', 'depparse'],
                'provided_reqs': set([]),
                'requires': set(['tokenize'])
            }, {
                'processor_type': 'DepparseProcessor',
                'processors_list': ['pos', 'depparse'],
                'provided_reqs': set([]),
                'requires': set(['tokenize', 'pos'])
            }]),
        # no pos when lemma_pos set to True
        (
            # input config
            {
                'processors': 'tokenize,mwt,lemma',
                'models_dir': TEST_MODELS_DIR,
                'lang': 'en',
                'lemma_pos': True
            },
            # 1 expected exception
            [{
                'processor_type': 'LemmaProcessor',
                'processors_list': ['tokenize', 'mwt', 'lemma'],
                'provided_reqs': set(['tokenize', 'mwt']),
                'requires': set(['tokenize', 'pos'])
            }])
    ]
    # try to build each bad config, catch exceptions, check against gold
    pipeline_fails = 0
    for bad_config, gold_exceptions in bad_config_lists:
        try:
            stanfordnlp.Pipeline(**bad_config)
        except PipelineRequirementsException as e:
            pipeline_fails += 1
            assert isinstance(e, PipelineRequirementsException)
            assert len(e.processor_req_fails) == len(gold_exceptions)
            for processor_req_e, gold_exception in zip(e.processor_req_fails,
                                                       gold_exceptions):
                # compare the thrown ProcessorRequirementsExceptions against gold
                check_exception_vals(processor_req_e, gold_exception)
    # check pipeline building failed twice
    assert pipeline_fails == 2
Example 29
    def __init__(self):
        self.translator = Translator()
        self.nlp_de = stanfordnlp.Pipeline(lang='de')
        self.nlp_en = stanfordnlp.Pipeline(lang='en')
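        # German preposition groups used below: accusative-only, dative-only, and two-way (accusative/dative)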
        self.prep_akk = ['bis', 'durch', 'für', 'gegen', 'ohne', 'um']
        self.prep_dat = [
            'aus', 'ausser', 'bei', 'nach', 'mit', 'seit', 'von', 'zu'
        ]
        self.prep_acc_dat = [
            'an', 'auf', 'hinter', 'in', 'neben', 'über', 'unter', 'von',
            'zwischen'
        ]

        self.nominative = {'Masc': 'der', 'Fem': 'die', 'Neut': 'das'}
        self.accusative = {'Masc': 'den', 'Fem': 'die', 'Neut': 'das'}
        self.dative = {'Masc': 'dem', 'Fem': 'der', 'Neut': 'dem'}
        self.genitive = {'Masc': 'des', 'Fem': 'der', 'Neut': 'des'}
Example 30
def norm_seeds(lst, lang="da"):
    nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma', lang=lang)
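    # lemmatize the joined seed words, lowercase each lemma, then deduplicate and sort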
    seeds = " ".join(lst)
    doc = nlp(seeds)
    seeds = [
        word.lemma.lower() for sent in doc.sentences for word in sent.words
    ]
    return sorted(list(set(seeds)))