Example #1
def tree2brackets(tree):
	result, tag = '', ''
	for item in tree2conlltags(tree):
		if item[2][0] in {'B', 'O'} and tag:
			result += tag + '] '
			tag = ''

		if item[2][0] == 'B':
			tag = item[2].split('-')[1]
			result += '['
		result += item[0] + ' '

	if tag:
		result += tag + '] '

	return result.strip()
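A minimal usage sketch for tree2brackets; the NP grammar below is only an illustrative assumption, and tree2conlltags must be importable for the function above to run:

from nltk import pos_tag, word_tokenize, RegexpParser
from nltk.chunk import tree2conlltags

chunker = RegexpParser('NP: {<DT>?<JJ>*<NN>}')
tree = chunker.parse(pos_tag(word_tokenize('the cat sat on the mat')))
print(tree2brackets(tree))
# prints roughly: '[the cat NP] sat on [the mat NP]'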
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate candidate words or phrases from a given string
    (see the usage sketch after this function).

    Parameters
    ----------
    texts: str, input text string
    method: str, how to extract candidates, either 'word' or 'phrase'
    remove_punctuation: bool, if True, strip punctuation from each sentence before tagging

    Returns
    -------
    candidates: list, list of candidate words or phrases
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence) # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_) # POS tagging

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable([tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
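A hedged usage sketch for generate_candidate; stop_words and punct_re are module-level names the snippet relies on, defined here as plausible stand-ins:

import re
from itertools import chain, groupby
from nltk import sent_tokenize, word_tokenize, pos_tag_sents, RegexpParser
from nltk.chunk import tree2conlltags
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
punct_re = re.compile(r'[^a-zA-Z0-9 ]')

print(generate_candidate('Deep learning improves keyword extraction.', method='phrase'))
# e.g. ['deep learning', 'keyword extraction']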
Example #3
 def extract_keyphrases(self, document):
     """
     For each document, parse its sentences with the chunker built from
     our grammar and convert each parse tree into a tagged (IOB) sequence.
     Yields the extracted phrases.
     """
     for sents in document:
         for sent in sents:
             sent = self.normalize(sent)
             if not sent: continue
             chunks = tree2conlltags(self.chunker.parse(sent))
             phrases = [
                 " ".join(word for word, pos, chunk in group).lower()
                 for key, group in groupby(
                     chunks, lambda term: term[-1] != 'O'
                 ) if key
             ]
             for phrase in phrases:
                 yield phrase
Example #4
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker    = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda wpc: wpc[2] != 'O'  # wpc = (word, pos, chunk)
            ) if key
        ]
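The function above appears truncated (it builds chunks but never yields or returns them); a self-contained sketch of the same groupby-over-IOB pattern, with an assumed grammar standing in for the project's GRAMMAR:

import nltk
from itertools import groupby
from nltk import RegexpParser
from nltk.chunk import tree2conlltags

grammar = 'KT: {<JJ>* <NN.*>+}'
chunker = RegexpParser(grammar)
tagged = nltk.pos_tag(nltk.wordpunct_tokenize('Named entity recognition finds proper nouns.'))
iob = tree2conlltags(chunker.parse(tagged))
phrases = [' '.join(word for word, pos, chunk in group).lower()
           for key, group in groupby(iob, lambda term: term[2] != 'O') if key]
print(phrases)  # e.g. ['named entity recognition', 'proper nouns']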
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conllstr, conllstr2tree, conlltags2tree, tree2conlltags
import nltk

text = "Fly me from Seattle to Tampa"
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
ner_tree = ne_chunk(tagged_tokens)
print(ner_tree)
iob_tagged = tree2conlltags(ner_tree)
print(iob_tagged)
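# tree2conlltags flattens the NE tree into (word, POS, IOB-NE) triples; with the
# default NLTK NE chunker, 'Seattle' and 'Tampa' typically come back tagged B-GPE
# while the remaining tokens are 'O'.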
Example #6
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


sent1 = preprocess(ex1)
sent2 = preprocess(ex2)

cs1 = cp.parse(sent1)
cs2 = cp.parse(sent2)
print(cs1)
print(cs2)

iob_tagged1 = tree2conlltags(cs1)
pprint(iob_tagged1)

iob_tagged2 = tree2conlltags(cs2)
pprint(iob_tagged2)

ne_tree1 = nltk.ne_chunk(pos_tag(word_tokenize(ex1)))
print(ne_tree1)

ne_tree2 = nltk.ne_chunk(pos_tag(word_tokenize(ex2)))
print(ne_tree2)

doc1 = nlp(ex1)
print('Named Entities for sentence 1:')
pprint([(X.text, X.label_) for X in doc1.ents])
Example #7
 def chunkparser(self, pattern='NP: {<DT>?<JJ>*<NN>}'):
     cp = nltk.RegexpParser(pattern)
     cs = cp.parse(self.sent)
     iob_tagged = tree2conlltags(cs)
     self.iob_tagged = iob_tagged
Example #8
def fn_preprocess(art):
    art = nltk.word_tokenize(art)
    art = nltk.pos_tag(art)
    return art


art_processed = fn_preprocess(article)

results = ne_chunk(art_processed)

# for x in str(results).split('\n'):
#     if '/NN' in x:
#         print(x)

pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(art_processed)
# print(cs)

iob_tagged = tree2conlltags(cs)
# pprint(iob_tagged)

namedEntities = []
for word, pos, ner in iob_tagged:
    namedEntities.append(ner)
#     print(word, pos, ner)

print('Named Entities in Document')
print(len(namedEntities))
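namedEntities keeps one IOB tag per token, so the count above includes the 'O' tokens as well; a quick way to count only tokens inside named-entity chunks might be:

entity_tokens = [ner for ner in namedEntities if ner != 'O']
print('Entity tokens in Document:', len(entity_tokens))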
    sent = ""
    labels = []
    # try:
    for word in tsvin:
        word = word.split("\t")
        word = [w.replace("\n", "") for w in word]

        if word[0] == '':
            splitted = sent.split(" ")
            splitted = [str.strip(w) for w in splitted]
            # splitted = [re.sub('[^A-Za-z0-9]+', '', w) for w in splitted]
            splitted = [w for w in splitted if len(w) >= 1]
            # print splitted
            X_test_final.append(
                sent2features((tree2conlltags(ne_chunk(pos_tag(splitted))))))
            y_test_final.append(labels)
            sent = ""
            labels = []
        else:
            # if len(word[0].split(" ")) > 1:
            # print word[0].split(" ")
            sent = sent + " " + str.strip(word[0])
            labels.append(word[1])
    # except:
    #     print

# with open(CONST_WIKI_ALL,'rb') as tsvin, open('new.csv', 'wb') as csvout:
#     tsvin = csv.reader(tsvin, delimiter='\t')
#     for word in tsvin:
#         print word
Example #10
def find_elements(text,
                  full=False,
                  trim=True,
                  low_trim_limit=2,
                  high_trim_limit=2000):
    sent = nltk.pos_tag(nltk.word_tokenize(text))
    elements = dict()
    if full:  # do all nouns
        for word, tag in sent:
            if tag in ("NN", "NNS", "NNP", "NNPS", "PRP"):
                elements[word.lower()] = 0
    else:  # do only NE + extra
        for word, tag in sent:
            if tag == "PRP":
                elements[word.lower()] = 0

        pattern = 'NP: {<DT>?<JJ>*<NN>}'
        cp = nltk.RegexpParser(pattern)
        cs = cp.parse(sent)
        iob_tagged = tree2conlltags(cs)
        parsed = pformat(iob_tagged)
        parsed = ast.literal_eval(parsed)
        tempString = ""
        for x in parsed:
            if x[2] == 'B-NP' or x[2] == 'I-NP':
                tempString = tempString + x[0].lower() + " "
            if x[2] == 'O' and len(tempString) > 0:
                tempString = tempString.rstrip()
                # strip a leading determiner, applied in the same order as the
                # original nested re.sub calls (the, this, no, an, a)
                for det in ("^the ", "^this ", "^no ", "^an ", "^a "):
                    tempString = re.sub(det, "", tempString)
                elements[tempString.rstrip().lower()] = 0
                tempString = ""

        doc = nlp(text)
        parsed = pformat([(X.text, X.label_) for X in doc.ents])
        parsed = ast.literal_eval(parsed)
        for ent_text, ent_label in parsed:
            if ent_label in ('PERSON', 'ORG', 'PRODUCT', 'LOC', 'FAC'):
                tempString = ent_text.lower()
                # drop determiners and newlines anywhere in the entity text,
                # in the same order as the original chained str.replace calls
                for det in ('a ', 'an ', 'no ', 'this ', 'the ', '\n'):
                    tempString = tempString.replace(det, '')
                elements[tempString] = 0

    if trim:
        text = text.lower()
        for x in elements.keys():
            elements[x] = my_count(text, x)
        elements = {k: v for k, v in elements.items() if v > low_trim_limit}
        elements = {k: v for k, v in elements.items() if v <= high_trim_limit}

    pprint(elements)
    return elements
Example #11
 def __init__(self, train_sents, *args, **kwargs):
     tag_sents = [tree2conlltags(sent) for sent in train_sents]
     train_chunks = [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
     self.tagger = ConsecutiveNPChunkTagger.train(train_chunks, *args, **kwargs)
 def tag_bio(self): 
   #print(ne_chunk(pos_tag(self._ts_abs_word_tokens))) 
   iob_tagged = tree2conlltags(ne_chunk(pos_tag(self._ts_abs_word_tokens)))
   print(iob_tagged)
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
sentence = '''
        Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very
        close to the Manhattan Bridge which is visible from the window.
        '''
print(tree2conlltags(ne_chunk(pos_tag(word_tokenize(sentence)))))
        }

        return list(features.values())
 
 
if __name__ == "__main__":

    # transformed = [list(map(lambda x: ((x[0], x[1]), x[2]), s)) for s in chunked_sents]
    # random.shuffle(transformed)
    # train_sents = transformed[:int(len(transformed) * 0.9)]
    # test_sents = transformed[int(len(transformed) * 0.9 + 1):]

    # from nltk.stem.snowball import SnowballStemmer

    file_path = sys.argv[1]
    chunked_sents = [tree2conlltags(chunk.conllstr2tree(s)) for s in open(file_path).read().strip().split("\n\n")]
    random.shuffle(chunked_sents)
    train_sents = []#chunked_sents[:int(len(chunked_sents) * 0.7)]
    test_sents = chunked_sents[int(len(chunked_sents) * 0.7 + 1):]

    ### CRF Chunker

    chunker = CRFChunkParser(chunked_sents=train_sents, model_file="russian_chunker.crf")
    print(chunker.evaluate([conlltags2tree(s) for s in test_sents]))


    # from nltk.tag.crf import CRFTagger
    # chunker = CRFTagger(feature_func=feature_detector)

    # chunker.set_model_file("russian_chunker.crf")
    # chunker.train(train_sents, "russian_chunker.crf")
Example #15
    for x in list1:
        # check if exists in unique_list or not
        if x not in unique_list:
            unique_list.append(x)
    # print list
    for x in unique_list:
        print(x)


##### Processing the EBS input data file to extract a dump of unique keywords

iob_tagged = []
for i in range(0, len(df['PMHD_TA004_SYS_T_DES'])):
    #print(i)
    ne_tree = ne_chunk(pos_tag(word_tokenize(df['PMHD_TA004_SYS_T_DES'][i])))
    iob_tagged.append(tree2conlltags(ne_tree))

s1 = []
for i in range(0, len(iob_tagged)):
    s1.append([tok[0] for tok in iob_tagged[i]])

s2 = []
for i in range(0, len(iob_tagged)):
    s2.append([tok[1] for tok in iob_tagged[i]])

s3 = []
for i in range(0, len(iob_tagged)):
    s3.append([tok[2] for tok in iob_tagged[i]])

s01 = []
for i in range(0, len(s1)):
 def chunking(sentence):
     from nltk.chunk import conlltags2tree, tree2conlltags
     iob_tagged = tree2conlltags(sentence)
     chunked_tree =conlltags2tree(iob_tagged)
     return chunked_tree
Example #17
def namedEntityRecognition(pos):
    chunked_token = ne_chunk(pos)
    named_entity = tree2conlltags(chunked_token)
    return named_entity
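A hedged call sketch for namedEntityRecognition; pos is expected to be a POS-tagged token list, and ne_chunk/tree2conlltags are assumed to be imported in the module:

from nltk import pos_tag, word_tokenize

iob = namedEntityRecognition(pos_tag(word_tokenize('Barack Obama visited Paris.')))
print(iob)  # (word, POS, NE-IOB) triples; 'Barack Obama' typically comes back as B-PERSON / I-PERSON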
def rm_breaks(text, beta):

    #Convert to lower
    text = text.lower()
    #Remove commas
    text = text.replace(',', '')
    #Remove DOIs
    text = re.sub(r'\d+\.\d+/\w+', '', text)
    text = re.sub(r'doi:*', '', text)
    #Replace 'hypothesis 1' with 'h1'
    text = text.replace('hypotheses', 'hypothesis')
    text = re.sub(r'hypothesis (?=\d+)', 'h', text)
    #Remove numbers that don't have a character immediately before them (since H0 indicates a hypothesis)
    text = re.sub(r'\W+\d+', '', text)
    text = re.sub(r'\d{2,4}', '', text)
    #Replace jstor link with 'jstor' placeholder, then delete
    text = re.sub(r'https?://.+', 'jstor.', text)
    # text = re.sub(r'\S+\.jstor\.\S+', 'jstor.', text))
    text = re.sub(r'\.{2,}|:', '', text)
    text = re.sub(r'this\scontent.+', '', text)
    #Delete jstor placeholder
    check = re.sub(r'.*jstor.*', '', text)
    if check != '':
        text = re.sub(r'.*jstor.*', '', text)
    #Remove word interruptions
    text = re.sub(r'-\s*\n\s*', '', text)
    #Remove line breaks
    text = re.sub(r'\n', '', text)
    #Do NER and remove sentences with too many named entities
    sent = preprocess(text)
    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    cp = nltk.RegexpParser(pattern)
    cs = cp.parse(sent)
    iob_tagged = tree2conlltags(cs)
    Owords = 0
    wordCount = 0
    maintext = []
    holder = []
    for i in iob_tagged:
        holder.append(i[0])
        wordCount += 1
        if i[0] == '.':
            score = Owords / wordCount
            if 'hypothesis' in holder:
                maintext += holder
                holder.clear()
            elif 'jstor' in holder:
                Owords = 0
                wordCount = 0
                holder.clear()
                continue
            elif score >= beta:
                maintext += holder
            Owords = 0
            wordCount = 0
            holder.clear()
        if i[2] == 'O':
            Owords += 1
    if maintext != []:
        maintext = functools.reduce(lambda a, b: a + ' ' + b, maintext)
    return maintext
Example #19
def senten_tag(sentence):
    ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
    iob_tagged = tree2conlltags(ne_tree)
    return iob_tagged
Example #20
        if(data is None or data['message'] is None or data['message'] == ""):
            continue
        msg = str(data['message'])
        msg = msg.strip()
        if(len(msg) == 0 or  english_ch.search(msg) == None):
            continue
        tokens = nltk.word_tokenize(msg)
        tokens = remove_blanc(tokens)
        tokens = remove_special(tokens)

        flag = "FALSE"
        for t in tokens:
            if(d.check(t)):
                flag = "TRUE"
                break
        x = tree2conlltags(ne_chunk(pos_tag(word_tokenize(msg))))
        nerf  = "N"
        for i in x:
            if(len(i) > 2 and not ("B-" in i[2] or "I-" in i[2] )):
                nerf  = "S"
                break
        if(flag == "TRUE" or nerf == "N"):
            if(date[0] not in transactions_date_wise):
                transactions_date_wise[date[0]] = 0
            transactions_date_wise[date[0]] = transactions_date_wise[date[0]] + 1
            textual_transactions = textual_transactions + 1
    except:
        continue        
f.close()

outputfile.write("DATE #TEXTUAL_TRANSACTIONS \n")
Example #21
File: parser.py Project: zemlni/GAT
def IOB_Tagging(t):
    iob_tagged = tree2conlltags(t)
    return iob_tagged
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import stopwords
import nltk
import os
import re

filepath = "Enter file path"
fin = open(filepath, 'r')
fout =  open('out.txt', 'w' )
text = fin.read()
text = re.sub(r'[^\w\s]',' ',text)        
sentence=sent_tokenize(text)
for x in sentence:
    words=word_tokenize(x)
    tagged_pos=pos_tag(words)
    namedEnt = nltk.ne_chunk(tagged_pos, binary=False)
    ne_tagged=(tree2conlltags(namedEnt))
    for ne in ne_tagged:
        ner=(ne[-1])
        ner1=str(ner)
    for tag in range(3):
        if tag == 0:
            gram = ("Nametag: {(<VBP>).*?(<JJ>?<NNP>+|<NNP>+)}")
        if tag == 1:
            gram = ("Datetag: {<CD><CD|JJ><CD>}")
        if tag == 2:
            gram = ("Qualificationtag: {<NNP>+<IN.*><NNP>} ")
            
        chunkParser = nltk.RegexpParser(gram)
        tree = chunkParser.parse(tagged_pos)
        iob_tagged=(tree2conlltags(tree))
        for iob in iob_tagged:
Example #23
ts = " Agent Name david member number 45678"
ts3 = "123467 is  davids member no"
ts2 = " mark and john are working at Google"

test = (nltk.pos_tag(word_tokenize(ts)),
        np.where(model.encode([" ".join(ts)]) >= 0, 'p',
                 'n').astype('|S1').tostring().decode('utf-8'))
print("POS tags output by nltk")
print(test[0])
#test = [('member',) ,('number', ),('is',), ('9860300',)]
X_test = extract_features(test)
ans = fcrf.predict_single(X_test)
print(ts)
print("NER tags recognized by CRF")
print(ans)
# compare ner tags output by stanford ner, nltk and spaCy
tokenized_text = word_tokenize(ts)
ner_st = nerst.tag(tokenized_text)
print("stanford ner tags")
print(ner_st)
pos_nltk = nltk.pos_tag(tokenized_text)
print("nltk tags")
print(pos_nltk)
print(tree2conlltags(ne_chunk(pos_nltk)))
nlp = spacy.load('en_core_web_sm')
doc = nlp(ts)
print("spacy ner tags")
print([(X.text, X.label_) for X in doc.ents])

exit()
Example #24
text1 = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
text2="Please advise on the options the deceased clients wife has in relation to this pension" \
     "   She wishes to exercise ARF option if available "
text="Hi I was trying to register online but I was n t recognised " \
     "  My  France number is 4824461      " \
     "Looking to register on Pension Planet Robert Manning" \
     "   but Irish Ronnie Gardner website ca n t find my details        " \
     "Richard Wade "
text = 'How can I pay my car renewal'
tokenized_text = word_tokenize(text)
ner_st = st.tag(tokenized_text)
print(ner_st)

pos_st = post.tag(tokenized_text)
print(pos_st)
exit()
pos_nltk = nltk.pos_tag(tokenized_text)
print(pos_nltk)

blob = TextBlob(text)
print(blob.tags)
print("tree stanford\n")
print("type of chunk", type(ne_chunk(pos_st)))

print("type of tree", len(tree2conlltags(ne_chunk(pos_st))))
print("tree nltk\n")
print(tree2conlltags(ne_chunk(pos_nltk)))
print("tree blob\n")
print(ne_chunk(pos_nltk))
print(tree2conlltags(ne_chunk(blob.tags)))
exit()
# Prepare and print metrics for the normal metrics
OO = prepare_for_metrics(119, chunker, data_set=test_samples, print_output=True)
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))
# -

# An example of a user fed definition
chunked = chunker.parse(pos_tag(word_tokenize(Def[0])))
D =list(filter(lambda x: isinstance(x, nltk.tree.Tree), chunked))[0]
' '.join([d[0] for d in D])

art = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml')
p_lst = [px.recutext_xml(p) for p in art.tag_list(tag='para')] 
p_vec = count_vect.transform(p_lst)
preds = clf.predict(p_vec)

for k,p in enumerate(p_lst):
    print(k,preds[k],p[:100])
    print('------')

chunk = tree2conlltags(chunker.parse(pos_tag(word_tokenize(p_lst[63]))))
for tok in chunk:
    print('{:15} {:>10} '.format(tok[0], tok[2]))

with open('../PickleJar/chunker.pickle', 'wb') as chunker_f:
    pickle.dump(chunker, chunker_f)

with open('data/vectorizer.pickle', 'wb') as token_f:
    pickle.dump(count_vect, token_f)  # presumably the fitted count_vect vectorizer used above
def write(filename, predictor):
    sentence = read_sentence(filename)
    for s in sentence:
        sentence_list, label_list = process_sentence(s)
        sen = mergeWords(sentence_list)
        # print(sen)

        #####assign pos#############################################
        pos_list = []
        # truple = tree2conlltags(ne_chunk(pos_tag(word_tokenize(sen))))
        truple = tree2conlltags(ne_chunk(pos_tag(sentence_list)))
        # the truple contains word, pos, ner-label
        for item in truple:
            pos_list.append(item[1])

        ################get words lemma and stem######################
        wordnet_lemmatizer = WordNetLemmatizer()
        lemma_list = []
        for word in sentence_list:
            lemma_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))

        stem_list = []
        lancaster = LancasterStemmer()
        for word in sentence_list:
            stem_list.append(lancaster.stem(word))
        # print(stem_list)

        #####assign constituency parent pos############################
        pos_parent_list, right_sublings_list, chunk_position, left_sublings_list = parse_consituency_tree(
            sentence_list, predictor)
        # print("=========pos===")
        # print(len(sentence_list))
        # print(len(chunk_position))
        # append a blank row
        sentence_list.append(" ")
        label_list.append(" ")
        pos_list.append(" ")
        pos_parent_list.append(" ")
        right_sublings_list.append(" ")
        chunk_position.append(" ")
        lemma_list.append(" ")
        stem_list.append(" ")
        left_sublings_list.append(" ")

        data = {}
        data["word"] = sentence_list
        data["label"] = label_list
        data["pos"] = pos_list
        data["chunk"] = pos_list
        data["pos_parent"] = pos_parent_list
        data["right_sublings_list"] = right_sublings_list
        data["chunk_position"] = chunk_position
        data["lemma_list"] = lemma_list
        data["stem_list"] = stem_list
        data["left_sublings_list"] = left_sublings_list
        df = pd.DataFrame(data)

        # to_filename = "word.csv"
        # df.to_csv(to_filename)
        to_file = filename.split(".tsv")[0]
        to_file1 = to_file + "_feature_v1" + ".tsv"
        df.to_csv(to_file1,
                  sep='\t',
                  index=False,
                  header=False,
                  encoding="utf8",
                  mode='a')
Example #27
sentences = [
    "John is a man. He walks", "John and Mary are married. They have two kids",
    "In order for Ravi to be successful, he should follow John",
    "John met Mary in Barista. She asked him to order a Pizza"
]


def gender(word):
    return classifier.classify(feature(word))


for sent in sentences:
    chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)),
                           binary=False)
    stack = []
    print(sent)
    items = tree2conlltags(chunks)  #iob tagging
    for item in items:
        if item[1] == 'NNP' and (item[2] == 'B-PERSON' or item[2] == 'O'):
            stack.append((item[0], gender(item[0])))
        elif item[1] == 'CC':
            stack.append(item[0])
        elif item[1] == 'PRP':
            stack.append(item[0])
    print("\t {}".format(stack))

items

print(chunks)
    def generate_weather_answer(self, question):
        """
        Generate a weather forecast for a selected city.

        First try to extract the city from the user request; if that is not possible,
        fall back to the usual answer. Then query openweathermap for the forecast,
        plot the temperature in Celsius and Fahrenheit, and list the unique weather
        conditions.
        """
        # remove previous image, as it isn't needed anymore
        for i in glob.glob(os.path.join(os.getcwd(), '*.png')):
            os.remove(i)

        good_symbols_re = re.compile('[^a-zA-Z -]')
        question_cleaned = good_symbols_re.sub('', question)

        # Extract entities.
        tagged = tree2conlltags(ne_chunk(pos_tag(word_tokenize(question.title()))))
        cities = [i[0] for i in tagged if i[1] == 'NNP']
        city = ''
        for c in cities:
            data = requests.get('http://api.openweathermap.org/data/2.5/forecast?q={0}&appid=f00cf7123615727d162770891d4fd225'.format(c)).json()
            if data['cod'] == '200':
                city = c
                break
        if city == '':
               return self.generate_usual_answer(question)
        else:
            forecast = requests.get('http://api.openweathermap.org/data/2.5/forecast?q={0}&appid=f00cf7123615727d162770891d4fd225'.format(city)).json()
            if forecast['message'] == 'city not found':
                return "I don't know this city!"

            # Generate temperature and date lists for plotting
            date_list = []
            temp_list_c = []
            temp_list_f = []

            for reading in forecast['list']:
                date = datetime.fromtimestamp(int(reading['dt']))
                temperature_c = reading['main']['temp'] - 273.15
                temperature_f = reading['main']['temp'] * 9 / 5 - 459.67
                date_list.append(date)
                temp_list_c.append(temperature_c)
                temp_list_f.append(temperature_f)

            # make chart
            fig, ax = plt.subplots()
            ax.plot_date(date_list, temp_list_c, '-', label='Celsius')
            ax.plot_date(date_list, temp_list_f, '-', label='Fahrenheit')
            ax.grid(True)

            plt.xticks(rotation=30)
            plt.yticks(range(int(min(temp_list_c)) - 1, int(max(temp_list_f) + 1), 5))
            dtFmt = mdates.DateFormatter('%m/%d')
            ax.xaxis.set_major_formatter(dtFmt)
            plt.title('Temperature in {0}'.format(city))
            plt.legend()
            # save image, so it can be sent to user
            plt.savefig('plot.png')

            # List of possible unique weather conditions
            weather = ', '.join(list(set([i['weather'][0]['description'] for i in forecast['list']])))

            return 'Possible weather in the next few days: {0}.;{1}'.format(weather, 'plot.png')
Example #29
def getSyntaxInfo(sentence):
    tags = pos_tag(sentence.split())
    ne_tree = ne_chunk(tags)
    ne_tagged = tree2conlltags(ne_tree)
    syntax_info = []
    caps_range = set(range(ord('A'), ord('Z') + 1, 1))
    for i in range(len(tags)):
        tag = tags[i]
        ne_tag = ne_tagged[i][2]
        tag_no = tagset.index(tag[1])
        sentiment_score = [0, 0, 0]
        wordnetTag = getWordnetTag(tag[1])
        if wordnetTag is None:
            synset = wn.synsets(tag[0])
            if len(synset) == 0:
                synset = None
            else:
                synset = synset[0]
                sentiSynset = swn.senti_synset(synset.name())
                sentiment_score = [
                    sentiSynset.pos_score(),
                    sentiSynset.neg_score(),
                    sentiSynset.obj_score()
                ]
        else:
            synset = wn.synsets(tag[0], pos=wordnetTag)
            if len(synset) == 0:
                synset = None
            else:
                synset = synset[0]
                sentiSynset = swn.senti_synset(synset.name())
                sentiment_score = [
                    sentiSynset.pos_score(),
                    sentiSynset.neg_score(),
                    sentiSynset.obj_score()
                ]
        start_caps = int(ord(tag[0][0]) in caps_range)
        allcaps = 1
        for c in tag[0]:
            if ord(c) not in caps_range:
                allcaps = 0
                break
        is_number = 0
        try:
            n = float(tag[0])
            is_number = 1
        except:
            pass
        # for i in range(3):
        # 	sentiment_score[i]=sentiment_score[i]/0.25+4
        iob_tag = ne_tag[0]
        if ne_tag == 'O':
            ne_tag = ''
        else:
            ne_tag = ne_tag[2:]
        hypernyms = [synset]
        last_two_synsets = [None, None]
        same_synset = [0, 0]
        if synset is not None:
            while len(hypernyms[-1].hypernyms()) > 0:
                hypernyms.append(hypernyms[-1].hypernyms()[0])
            last_two_synsets = [hypernyms[-1].name(), None]
            same_synset[0] = int(last_two_synsets[0] == synset.name())
            if len(hypernyms) > 1:
                last_two_synsets[1] = hypernyms[-2].name()
                same_synset[1] = int(last_two_synsets[1] == synset.name())
        syntax_info.append([tag_no] + sentiment_score + [iob_tag, ne_tag] +
                           last_two_synsets + same_synset +
                           [start_caps, allcaps, is_number,
                            len(tag[0])])
    return syntax_info
import nltk
from nltk.chunk import conlltags2tree, tree2conlltags

sentence = 'Elon and Hawking met at SpaceX last Tuesday to discuss Artificial Intelligence'

try:
    tokenized_sentence = nltk.word_tokenize(sentence)
    tagged_sentence = nltk.pos_tag(tokenized_sentence)
    named_entity_tree = nltk.ne_chunk(tagged_sentence)
    iob_tagged = tree2conlltags(named_entity_tree)
    ne_tree = conlltags2tree(iob_tagged)
    for i in ne_tree:
        print(i)
except Exception as e:
    print(e)

 def __init__(self, train_sentences):
     train_data = [[(t, c) for w, t, c in tree2conlltags(sent)]
                   for sent in train_sentences]
     self.tagger = BigramTagger(train_data)
Example #32
            text = file.read()

        sentences = sentence_tokenizer.tokenize(text)
        persons = {}
        organizations = {}
        locations = {}
        geopolitical_entities = {}
        groups = {}
        facilities = {}
        multi_word = ''

        for sentence in sentences:

            tags = tagger.tag(tokenizer.tokenize(sentence))
            ne_tree_multiclass = multiclass_ner.parse(tags)
            iob_tagged_multiclass = tree2conlltags(ne_tree_multiclass)

            for current, next_value in zip(iob_tagged_multiclass, iob_tagged_multiclass[1:]):

                entity, category, next_entity, next_category = current[0], current[2], next_value[0], next_value[2]

                if 'B-' in category and next_category != 'O':
                    multi_word = entity
                    continue

                if 'I-' in category and next_category != 'O':
                    multi_word = multi_word + ' ' + entity
                    continue

                if 'I-' in category:
                    multi_word = multi_word + ' ' + entity
Example #33
def _conll(tokens):
    pos_tags = nltk.pos_tag(tokens)
    named_entities = ne_chunk(pos_tags)
    return [(x[0], x[2]) for x in tree2conlltags(named_entities)]
 def __init__(self):
     train_sents = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=['NP'])
     train_data = [[(t, c) for _, t, c in tree2conlltags(sent)] for sent in train_sents]
     unigram_tagger = nltk.UnigramTagger(train_data)
     self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
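Chunker classes built on NLTK taggers, like the __init__ above, usually pair with a parse() method following the standard NLTK-book pattern; a sketch, not necessarily this project's exact code:

import nltk

def parse(self, sentence):
    # tag the POS sequence with chunk tags, then rebuild a tree from the IOB triples
    pos_tags = [pos for (word, pos) in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag)
                 for ((word, pos), chunktag) in zip(sentence, chunktags)]
    return nltk.chunk.conlltags2tree(conlltags)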
Example #35
from proper_nouns.funcs.utilities import parse_ner_counts
from proper_nouns.funcs.utilities import parse_census_counts
from proper_nouns.funcs.utilities import tokenize_string

download_required_nltk_packages()

all_census_names = get_all_census_names()

tags = ['B-PERSON', 'I-PERSON']
census = {'truth_names': 0, 'difference': 0, 'no_names': 0}
tagged = {'truth_names': 0, 'test_names': 0, 'test_minus_tagged': 0, 'tagged_minus_test': 0, 'no_names': 0}

n = 0
corpus = read_gmb_corpus('tags')

for tagged_tokens in corpus:
    sentence = ' '.join([iob[0] for iob in tagged_tokens])

    test_the_tokens = tokenize_string(sentence)
    ne_tree = ne_chunk(test_the_tokens)
    test_tagged_tokens = tree2conlltags(ne_tree)
    ner_counts = is_person_tagged(tagged_tokens, test_tagged_tokens, tags)
    parse_ner_counts(ner_counts, tagged)

    census_counts = people_in_census(tagged_tokens, all_census_names, tags)
    parse_census_counts(census_counts, census)

    n += 1
    if n % 2000 == 0:
        print_intermediate_results(n, tagged, census)
Example #36
#   (NP September/NNP)
#   ,/,
#   due/JJ
#   (PP for/IN)
#   (NP release/NN)
#   (NP tomorrow/NN)
#   ,/,
#   (VP fail/VB to/TO show/VB)
#   (NP a/DT substantial/JJ improvement/NN)
#   (PP from/IN)
#   (NP July/NNP and/CC August/NNP)
#   (NP 's/POS near-record/JJ deficits/NNS)
#   ./.)

from nltk.chunk import tree2conlltags
iob_tagged = tree2conlltags(chunked_sentence)
print(iob_tagged)

# [
#   ('Confidence', 'NN', 'B-NP'),
#   ('in', 'IN', 'B-PP'),
#   ('the', 'DT', 'B-NP'),
#   ('pound', 'NN', 'I-NP'),
#   ('is', 'VBZ', 'B-VP'),
#   ('widely', 'RB', 'I-VP'),
#   ('expected', 'VBN', 'I-VP'),
#   ('to', 'TO', 'I-VP'),
#   ('take', 'VB', 'I-VP'),
#   ('another', 'DT', 'B-NP'),
#   ('sharp', 'JJ', 'I-NP'),
#   ('dive', 'NN', 'I-NP'),
	def __init__(self, train_sents, *args, **kwargs):
		tag_sents = [tree2conlltags(sent) for sent in train_sents]
		train_chunks = [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
		self.tagger = ClassifierTagger.train(train_chunks, *args, **kwargs)
Example #38
 def __init__(self, trainingChunkedSents):
     trainingData = [
         [(posTag, bioTag) for word, posTag, bioTag in tree2conlltags(chunkedSent)]
         for chunkedSent in trainingChunkedSents 
     ]
     self.tagger = TrigramTagger(trainingData)
Example #39
__author__ = 'User'

""" conll2002 is in Duch and Spanish so its not woriking well with that """


from nltk.corpus import conll2002
from nltk.chunk import tree2conlltags
import pandas as pd
from evaluate import evaluate
from mit_ie.mitie_series_ner_extractror import mitie_extract_ner_series
from stanford_ner.stanford_series_ner_extractor import stanford_extract_ner_series


chunked_words = tree2conlltags(conll2002.chunked_words())
df = pd.DataFrame(chunked_words, columns=['word', 'tmp', 'real_tag'])

# remove tmp col
df = df.loc[:, ["word", "real_tag"]]

# strip first two chars - "B-..." and "I-..."
df['real_tag'] = df['real_tag'].map(lambda x: x[2:] if len(x) > 2 else x)

# testing
df = df[:5000]

df.real_tag = list(df.real_tag)
df.word = df.word.map(str)

# df = add_dataframe_ner_tags(corpus_df=df, ner_extractor=mitie_extract_ner_series)
#
# print('###### MIT IE NER evaluation #####')