Example #1
    def __init__(self, content):
        self._content = content
        self._nlp = en_core_web_sm.load()
        self._processed = self._nlp(self._content)
        self._lemma = {}
        self._pos = {}
        self._pos_ = {}
        self._word = {}
        self._sentiment = {}
        self._tag = {}
        self._dep = {}
        self._prob = {}
        self._idx = {}
        self.process()
def nlp_init(tries=0):
    if tries > 0:
        print('Retrying, try %d' % tries)

    global loading_status
    if loading_status == 'none' and 'nlp' not in data:
        try:
            print('NER init...')
            loading_status = 'loading'
            data['nlp'] = english_model.load()
            loading_status = 'done'
        except Exception as exc:
            print(exc)
            loading_status = 'none'
    elif loading_status == 'loading' and tries < 30:
        time.sleep(10)
        if loading_status == 'loading':
            new_tries = tries + 1
            return nlp_init(tries=new_tries)

    return data['nlp']
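
A minimal sketch of how this lazy loader might be wired up; the module-level globals and the import alias are assumptions inferred from the function body:

import time
import en_core_web_sm as english_model   # assumed alias used by nlp_init above

data = {}
loading_status = 'none'

nlp = nlp_init()      # loads the model on the first call, reuses the cached pipeline afterwards
doc = nlp("Lazy loading keeps repeated requests cheap.")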
Example #3
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pprint import pprint
import re
import itertools
import contractions

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords



nlp = en_core_web_sm.load()

L = pd.read_csv("lyrics.csv", index_col=0)

# drop rows with missing values
L = L.dropna()

# replace newline characters with spaces
L = L.replace({'\n': ' '}, regex=True)
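
As a follow-on, a hedged sketch of stopword removal using the imports already loaded above; the column name "lyrics" is an assumption:

stop_words = set(stopwords.words('english'))
L['clean'] = L['lyrics'].apply(
    lambda text: ' '.join(
        tok for tok in contractions.fix(str(text)).lower().split()
        if tok not in stop_words
    )
)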
Example #4
    def __init__(self, stopWordsFilePath):
        self.nlp = en_core_web_sm.load()
        self.stop_words_path = stopWordsFilePath
        self.noOfClasses = ["Negative", "Positive"]

    def aqgParse(self, sentence):

        #nlp = spacy.load("en")
        nlp = en_core_web_sm.load()

        singleSentences = sentence.split(".")
        questionsList = []
        if len(singleSentences) != 0:
            for i in range(len(singleSentences)):
                segmentSets = singleSentences[i].split(",")

                ner = nerTagger(nlp, singleSentences[i])

                if (len(segmentSets)) != 0:
                    for j in range(len(segmentSets)):
                        try:
                            questionsList += clause.howmuch_2(segmentSets, j, ner)
                        except Exception:
                            pass

                        if identification.clause_identify(segmentSets[j]) == 1:
                            try:
                                questionsList += clause.whom_1(segmentSets, j, ner)
                            except Exception:
                                pass
                            try:
                                questionsList += clause.whom_2(segmentSets, j, ner)
                            except Exception:
                                pass
                            try:
                                questionsList += clause.whom_3(segmentSets, j, ner)
                            except Exception:
                                pass
                            try:
                                questionsList += clause.whose(segmentSets, j, ner)
                            except Exception:
                                pass
                            try:
                                questionsList += clause.what_to_do(segmentSets, j, ner)
                            except Exception:
                                pass
                            try:
                                questionsList += clause.who(segmentSets, j, ner)
                            except Exception:
                                pass
                            try:
                                questionsList += clause.howmuch_1(segmentSets, j, ner)
                            except Exception:
                                pass
                            try:
                                questionsList += clause.howmuch_3(segmentSets, j, ner)
                            except Exception:
                                pass


                        else:
                            s = ""
                            try:
                                s = identification.subjectphrase_search(segmentSets, j)
                            except Exception:
                                pass

                            if len(s) != 0:
                                segmentSets[j] = s + segmentSets[j]
                                try:
                                    questionsList += clause.whom_1(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += clause.whom_2(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += clause.whom_3(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += clause.whose(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += clause.what_to_do(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += clause.who(segmentSets, j, ner)
                                except Exception:
                                    pass

                            else:
                                try:
                                    questionsList += nonClause.what_whom1(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += nonClause.what_whom2(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += nonClause.whose(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += nonClause.howmany(segmentSets, j, ner)
                                except Exception:
                                    pass
                                try:
                                    questionsList += nonClause.howmuch_1(segmentSets, j, ner)
                                except Exception:
                                    pass

                questionsList.append('\n')
        return questionsList
Example #6
def retrieve_input():
    s = textBox.get("1.0", "end-1c")

    name = []
    ent = []
    nlp = en_core_web_sm.load()
    doc = nlp(s)
    for X in doc.ents:
        name.append(X.text)
        ent.append(X.label_)
    s = pos_tag(word_tokenize(s))
    ls = []
    ls2 = []
    for i in s:
        # skip tokens whose POS tag starts with W, J, D, M, or R
        if i[1][0] in ("W", "J", "D", "M", "R"):
            continue
        else:
            ls.append(i)
            ls2.append(i[0].lower())
    c = 1  ## index for dataframe
    if len(ent):
        for i in range(len(ent)):
            if ent[i] == 'GPE':
                df.iloc[c]["Team"] = name[i]
                if "win" in ls2:
                    df.iloc[c]["Team_win"] = 1
                    df.iloc[c]["Valid Before"] = "first Innings"
                elif "wickets" in ls2 or "wicket" in ls2:
                    df.iloc[c]["Team_wickets"] = 1
                    df.iloc[c]["Valid Before"] = 20
                elif "runs" in ls2 or "run" in ls2:
                    df.iloc[c]["Team_run"] = 1
                    df.iloc[c]["Valid Before"] = 20
                elif "boundaries" in ls2:
                    df.iloc[c]["Boundaries"] = 1
                    df.iloc[c]["Valid Before"] = 20
                elif "sixes" in ls2:
                    df.iloc[c]["Sixes"] = 1
                    df.iloc[c]["Valid Before"] = 20
            elif ent[i] in ('ORDINAL', 'CARDINAL', 'DATE'):
                if "over" in ls2 or "overs" in ls2:
                    if "." in name[i]:
                        s = name[i].split(".")
                        df.iloc[c]["Over"] = int(s[0])
                        df.iloc[c]["Ball"] = int(s[1])
                    df.iloc[c]["Over"] = int(name[i][0:-2])
            else:
                if "wickets" in ls2 or "wicket" in ls2:
                    df.iloc[c]["Bowler"] = name[i]
                    df.iloc[c]["Valid Before"] = 5
                else:
                    df.iloc[c]["Batsmen"] = name[i]
                    df.iloc[c]["Valid Before"] = 20
                    if "boundaries" in ls2:
                        df.iloc[c]["Boundaries"] = 1
                        df.iloc[c]["Valid Before"] = 20
                    elif "sixes" in ls2:
                        df.iloc[c]["Sixes"] = 1
                        df.iloc[c]["Valid Before"] = 20

    elif "wickets" in ls2 or "wicket" in ls2:
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Team_wickets"] = 1
        df.iloc[c]["Valid Before"] = 20
    elif "win" in ls2:
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Team_win"] = 1
        df.iloc[c]["Valid Before"] = 20
    elif "runs" in ls2 or "run" in ls2:
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Team_run"] = 1
        df.iloc[c]["Valid Before"] = 20
    elif "boundaries" in ls2:
        df.iloc[c]["Boundaries"] = 1
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Valid Before"] = 20
    elif "sixes" in ls2:
        df.iloc[c]["Sixes"] = 1
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Valid Before"] = 20
    else:
        print("INVALID QUESTION")
Example #7
    all_lectures = json.load(f)

# Days of the week
days = {
    "Mon": "Monday",
    "Tue": "Tuesday",
    "Wed": "Wednesday",
    "Thu": "Thursday",
    "Fri": "Friday",
    "Sat": "Saturday",
    "Sun": "Sunday"
}

# Lectures
lectures = {}
parser = en_core_web_sm.load()
for entry in all_lectures:
    lecture = all_lectures[entry]["name"]
    tree = parser(lecture)
    acronym = ""
    for token in tree:
        if token.tag_ not in ("IN", "TO", "DT", "CC"):
            acronym = acronym + token.text[0].upper()
    lectures[acronym] = lecture

with open('lectures_acr.json', 'w') as fp:
    json.dump(lectures, fp)

with open('days_acr.json', 'w') as fp:
    json.dump(days, fp)
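
For illustration, a small standalone check of the same acronym logic; the example title is made up and the exact tags depend on the model:

title = "Introduction to Machine Learning"
acronym = "".join(
    tok.text[0].upper()
    for tok in parser(title)
    if tok.tag_ not in ("IN", "TO", "DT", "CC")   # skip prepositions, "to", determiners, conjunctions
)
print(acronym)   # expected: "IML"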
Example #8
from flask import jsonify
from flask import Flask, request, send_from_directory, send_file
import en_core_web_sm
from dateparser import parse
# from nltk import word_tokenize, ngrams
import string
# from nltk.stem import WordNetLemmatizer
# from gensim.models import KeyedVectors
import random
import pdfrw
from nameparser import HumanName
# wordnet_lemmatizer = WordNetLemmatizer()

spacy_nlp = en_core_web_sm.load()

app = Flask(__name__, static_url_path='/documents')
# spacy_nlp = spacy.load('en')

# topics
# DISABILITY = 'DISABILITY'
# ILLNESS = 'ILLNESS'
# DEATH = 'DEATH'
# CHILDREN = 'CHILDREN'

# seed words for the word2vec model
# REASONS = {
#     'disability': DISABILITY,
#     'deaf': DISABILITY,
#     'disabled': DISABILITY,
#     'illness': ILLNESS,
def loadProdReviewData(PROD_DICT):
    global category1, category2, main_cat, cond_date
    rev_db_col = 'new_reviews2'
    review_cursor = mongo.searchInDB(mongoObj.getReviewKey(),
                                     db_col=rev_db_col)
    docCount = review_cursor.count()
    print("make product reviews feature from %s reviews..." % (docCount))
    print("Search reviews finished...")
    nlp = en_core_web_sm.load()

    with tqdm(total=docCount) as pbar:
        for i2, rev in enumerate(review_cursor):
            asin, review, overall, vote, summary, review_ID = \
             rev["asin"], rev["reviewText"], rev['overall'], rev['vote'], rev['summary'], str(rev['unixReviewTime'])
            if asin not in PROD_DICT.keys(): asin = 'UNK'

            if "REVIEW_ITEM_LIST" not in PROD_DICT[asin].keys():
                DATA_DICT = PROD_DICT[asin]
                DATA_DICT["REVIEW_ITEM_LIST"] = []

            DATA_DICT = PROD_DICT[asin]
            REVIEW_ITEM_LIST = DATA_DICT["REVIEW_ITEM_LIST"]

            # ----------------------------------------------------------------------------------------------
            # print("\n\n")
            # print(review)
            # print("\n\n\n\n")
            review = re.sub(r'http\S+', '', review)
            review = remove_word4(review)
            lemm_review = lemm_sent_process4(review,
                                             remove_stopwords=False,
                                             summary=False,
                                             mode="spacy",
                                             withdot=True)
            lemm_review_len = len(lemm_review.split(" "))

            lemm_review = lemm_review.split(" .")
            lemm_review = [
                line + " . " for line in lemm_review if len(line) > 2
            ]

            # print("\n\n\n\n")
            # [print(line)  for line in lemm_review]
            # break
            # ---------------------------------------------------------------------------------------------
            # print("\n\n")
            # summary = '''
            # Light weight and smaller size = Fantastic!
            # '''
            # print(summary)
            # print("\n\n\n\n")
            summary = re.sub(r'http\S+', '', summary)
            summary = remove_word4(summary)

            lemm_summary = lemm_sent_process4(summary,
                                              remove_stopwords=False,
                                              summary=True,
                                              mode="spacy",
                                              withdot=False)
            lemm_summary_len = len(lemm_summary.split(" "))

            # print("\n\n\n\n")
            # print(lemm_summary)
            # break
            # ----------------------------------------------------------------------------------------------

            item_dict = {
                'review_ID': review_ID,
                "review": review,
                "overall": overall,
                "vote": vote,
                'summary': summary,
                'lemm_review': lemm_review,
                'lemm_review_len': lemm_review_len,
                'lemm_summary': lemm_summary,
                'lemm_summary_len': lemm_summary_len
            }

            REVIEW_ITEM_LIST.append(item_dict)
            DATA_DICT["REVIEW_ITEM_LIST"] = REVIEW_ITEM_LIST
            PROD_DICT[asin] = DATA_DICT
            pbar.set_description("%s REVIEW_ITEM_LIST " % (category1))
            pbar.update(1)

    DEL_ASIN = []
    for asin, DATA_DICT in PROD_DICT.items():
        if "REVIEW_ITEM_LIST" not in DATA_DICT: DEL_ASIN.append(asin)

    for asin in DEL_ASIN:
        del PROD_DICT[asin]

    return PROD_DICT
Example #10
def parseTranscripts():
    # loading to a dataframe
    #df = pd.read_table("feedback_ansi.txt", header=(0))
    df = pd.read_excel("clean_reviews_numbered.xlsx", header=(0))
    df = df.dropna()

    # throw error if output file already exists:
    if os.path.isfile('adhoc_chunked_output.csv'):
        exit('Older output file already exists!')
    else:
        with open('adhoc_chunked_output.csv', 'a') as f:
            f.write('doc_id,cleanPhrase,mergedPhrase,parsedNouns' + '\n')

    # cleaning transcripts after collecting lemmas using spacy lemmatizer
    df_clean_trans = rc.DataFrame(columns=['phrase', 'nouns'])

    print(datetime.datetime.now())
    start_time = datetime.datetime.now()
    print('Starting the process')
    rowNum = 0
    nlp = en_core_web_sm.load()
    spacyPhraseDict = {}
    nounDict = {}

    #phrased and cleaned trans dataframe:
    df = phraseBuilder(df)
    replaceDict = {}
    # load word2vec model
    model = word2vec.Word2Vec.load('../VEC_MODELS/fid_w2vec_pos_model')
    for index, row in df.iterrows():
        phrase = row['finalTrans']
        doc_id = row['RowNum']
        rowNum += 1
        print('Processing Row : %s' % rowNum)
        # spacy processing:
        spacyPhrase = nlp(phrase.decode('utf-8'))
        # buildWordTagDict(wordTagDict,spacyPhrase)
        posTaggedWordPhrase = ' '.join(
            string.replace(w.lemma_, '_', '-') + '_' + wordPosTag(w)
            for w in spacyPhrase if w.lemma_ not in spacy.en.STOP_WORDS)
        nounTaggedWords = ' '.join(
            string.replace(w.lemma_, '_', '-') for w in spacyPhrase
            if (w.pos_ == 'NOUN' or w.pos_ == 'PROPN' or w.pos_ == 'NUM')
            and w.lemma_ not in spacy.en.STOP_WORDS)

        interaction_key = doc_id
        if interaction_key not in spacyPhraseDict:
            spacyPhraseDict[interaction_key] = posTaggedWordPhrase
            nounDict[interaction_key] = nounTaggedWords

    # use spacy tags to create word-tag freq dictionary for the entire corpus
    wordFreqDict = wordListToFreqDict(list(spacyPhraseDict.values()))

    # now we can use the word-tag dict to start replacing words and reducing our sample space
    print('Reducing word feature space so as to reduce variants of words')
    rowNum = 0
    newWordCounter = 0
    for index, row in df.iterrows():
        #print(spacyPhrase)
        #print(nounPhrase)
        rowNum += 1
        interaction_key = row['RowNum']
        spacyPhrase = spacyPhraseDict[interaction_key]
        nounPhrase = nounDict[interaction_key]
        print('Processing Row : %s' % rowNum)

        # calculate similarity and reduce feature space by reducing words using word2vec and wordnet intersection
        # building sentence one word at a time
        sen = []
        for taggedWord in spacyPhrase.split():
            wordKey = taggedWord  # already in the key format, can be used later
            baseWord, tag = taggedWord.split('_')
            # make sure we see each word only once
            if wordKey in wordFreqDict and baseWord != '' and tag != '':
                wordFreq = wordFreqDict[wordKey]
                # first check the replaceDict if this key is already present, replace if present and skip the loop iteration
                if wordKey in replaceDict:
                    replaceWord = replaceDict[wordKey]
                    w, t = replaceWord.split('_')
                    sen.append(w)
                    continue
                # continue if no replacement word found in the replaceDict
                # word that is not present in the model or is a unique word
                newWordCounter += 1
                if not wordnet.synsets(baseWord,
                                       tag) or baseWord not in model.wv.vocab:
                    sen.append(baseWord)
                    replaceDict[wordKey] = wordKey
                # getting best replacement from word2vec context, note that synset only gets the pos tag synonymns
                # hence even though we don't fetch word2vec context we should be finding the correct replacement word
                else:
                    synonyms = wordnet.synsets(baseWord, tag)
                    lemmas = set(
                        chain.from_iterable(
                            [word.lemma_names() for word in synonyms]))
                    synlist = [str(term) for term in lemmas]
                    model_similar_words = model.most_similar(
                        positive=[baseWord], topn=10)
                    modellist = []
                    for item in model_similar_words:
                        item = str(item)
                        item = item.replace('\'', '')
                        item = item.replace('(', '')
                        item = item.replace(')', '')
                        term, score = item.split(',')
                        modellist.append(term)

                    # common words between synsets and w2vec may be more than one
                    commonWord = [
                        word for word in modellist if word in synlist
                    ]
                    if (len(commonWord) != 0):
                        score = -1
                        for word in commonWord:
                            key = word + '_' + tag
                            score = wordFreq
                            if key in wordFreqDict:
                                score_ = wordFreqDict[key]
                            else:
                                score_ = score
                            if score_ > score:
                                score = score_
                                maxScoreWord = word

                        if (score > wordFreq):
                            sen.append(maxScoreWord)
                            # word substitution happening, cache it in the dict
                            replaceDict[wordKey] = maxScoreWord + '_' + tag
                            print('Replacing word :: ' + wordKey +
                                  ' with :: ' + maxScoreWord)
                        else:
                            # substitution not happening for the word, add original to the dict
                            sen.append(baseWord)
                            replaceDict[wordKey] = wordKey

                    else:
                        sen.append(baseWord)
                        if wordKey not in replaceDict:
                            replaceDict[wordKey] = wordKey

            else:
                sen.append(baseWord)

        # phrase = ' '.join(wordnet.synsets(w)[0].lemmas()[0].name() for w in phrase.split())
        phrase = ' '.join(w for w in sen)
        phrase = phrase.replace('-PRON-', '')
        phrase = re.sub(r'\b\w{1,2}\b', '', phrase)
        phrase = phrase.replace('\'', '')
        phrase = re.sub(r'(\s)\w+-+\s', ' ', phrase)
        phrase = re.sub(r'(\s)-+\w+\s', ' ', phrase)
        phrase = phrase.replace(' - ', '')
        phrase = ' '.join(phrase.split())
        #print(phrase)

        nounPhrase = nounPhrase.replace('-PRON-', '')
        nounPhrase = re.sub(r'\b\w{1,2}\b', '', nounPhrase)
        nounPhrase = nounPhrase.replace('\'', '')
        nounPhrase = re.sub(r'(\s)\w+-+\s', ' ', nounPhrase)
        nounPhrase = re.sub(r'(\s)-+\w+\s', ' ', nounPhrase)
        nounPhrase = nounPhrase.replace(' - ', '')
        nounPhrase = ' '.join(nounPhrase.split())
        #print(nounPhrase)

        # collecting all clean lemmatized phrases by interaction id in a dataframe, this df will be used with gensim phraser
        df_clean_trans.append_row(index, {
            'doc_id': interaction_key,
            'phrase': phrase,
            'nouns': nounPhrase
        })

        # except:
        # print('Encountered issue with Spacy unicode token!!')

    # modify this function to directly read data from disk using LineSentences function of Gensim
    # function to build gensim phraser

    # convert to normal dataframe
    print('New words seen :: ' + str(newWordCounter))
    print('Num of words replaced : ' + str(len(replaceDict.keys())))

    # pickle the dictionary:
    # with open('../REPLACE_DICT_PICKLE', 'wb') as handle:
    #	pickle.dump(replaceDict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    data_dict = df_clean_trans.to_dict(index=False)
    df_clean_trans = pd.DataFrame(data_dict,
                                  columns=df_clean_trans.columns,
                                  index=df_clean_trans.index)

    # feed tokenized transcript through the POS context extractor logic, which will create final file for LDA input
    chunkProcessedTrans(df_clean_trans)

    # write gensim phrase to file
    #nlpOutput(df_clean_trans)

    end_time = datetime.datetime.now()
    print('Completed process in ::' + str(end_time - start_time))
    exit(0)
Example #11
"""
__init__.py

library module for the app
"""
import json
import os
import tempfile

import en_core_web_sm
nlp = en_core_web_sm.load()

from app import app


def nlp_over_lines_as_blob(lines, *extractors):
    """Given an iterable collection of lines of text, generates complete
    sentences and runs a series of extractor functions over each sentence.

    Yields a tuple for each sentence containing the results of each extractor
    """
    blob = ' '.join(lines)
    doc = nlp(blob)
    for sent in doc.sents:
        extracted = [ext(sent) for ext in extractors]
        yield tuple(extracted)
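
# A quick hedged usage sketch: any callable taking a spaCy Span works as an extractor.
#
#   lines = ["Apple was founded by Steve Jobs.", "It is based in Cupertino."]
#   for (n_ents,) in nlp_over_lines_as_blob(lines, lambda sent: len(sent.ents)):
#       print(n_ents)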

def nlp_over_lines(lines, *extractors):
    """Given an iterable collection of lines of text, runs a series of
    extractor functions over each line.
    """
    wordsByRake.append(rake.apply(docs[i]))

def listOfLists(lst):
    temp =[]
    for i in range(0,len(lst)):
        if(lst[i][1] > 4.0): #this can be changed according to the required number of tags
            temp.append(lst[i][0])
    return [elem for elem in temp]

for i in range(0,len(wordsByRake)):
    trainDocs.insert(i, listOfLists(wordsByRake[i]))

wordsByRake = [] #to avoid memory usage
docs = [] #to avoid memory usage

nlp = en_core_web_sm.load() #loading the model
ruler = EntityRuler(nlp) 

pattern = []  # to store the pattern for tagging each doc

def labelDecider(txt, tag, count):
    patternDict = {'label': str(tag), 'pattern': txt[count]}
    return patternDict

for i in range(0, len(trainDocs)):  # 0 to 80
    docCount = 0
    case = trainDocs[i]
    if len(trainDocs[i]) != 0:
        if len(trainTags[i]) == 1:  # for docs with one label
            tagCase = trainTags[i]
Example #13
import pandas as pd
import pickle5 as pickle

from app.lrs.base_model import BaseModel

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import en_core_web_sm

nlp = en_core_web_sm.load()  # https://spacy.io/usage/models#production
#https://pypi.org/project/spacy-langdetect/
from spacy_langdetect import LanguageDetector
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
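# A quick hedged check that the detector is attached (attribute per spacy_langdetect's docs):
#   doc = nlp("This is an English sentence.")
#   print(doc._.language)   # e.g. {'language': 'en', 'score': 0.99...}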

MODEL_PATH = 'app/models/'
REPLACED_WORDS = pickle.load(open(MODEL_PATH + "replaced_words.pkl", "rb"))
stop_list = pickle.load(open(MODEL_PATH + "stop_list.pkl", "rb"))

ENGINE_TYPES = ['LDA', 'LDA', 'NMF', 'NMF']
ENGINE_VERSIONS = [
    'model_12_content_3_topic_80_20', 'model_12_content_10_topic_80_20',
    'nmf_model_13_content_3_topic_80_20', 'nmf_model_13_content_10_topic_80_20'
]


# Utility functions
def make_suggestions(TAR_6, engine):
    # P R E D I C T I O N
    norms = pickle.load(
Example #14
def parseTranscripts():

    #loading to a dataframe
    df = pd.read_table("outTrans.tsv", sep='\t', header=(0))

    #throw error if output file already exists:
    if os.path.isfile('SCRIPT_NLP_CONTEXT.csv'):
        exit('Older output file already exists!')
    else:
        with open('SCRIPT_NLP_CONTEXT.csv', 'a') as f:
            f.write('interaction_id,nice_interaction_id,customer_id,call_date,cleanPhrase,mergedPhrase' + '\n')

    # cleaning transcripts after collecting lemmas using spacy lemmatizer
    df_clean_trans = rc.DataFrame(columns=['interaction_id', 'nice_interaction_id', 'customer_id', 'call_date', 'phrase'])

    print(datetime.datetime.now())
    start_time = datetime.datetime.now()
    print('Starting the process')
    rowNum = 0
    nlp = en_core_web_sm.load()
    spacyPhraseDict = {}
    '''
    if os.path.isfile('../REPLACE_DICT_PICKLE'):
	with open('../REPLACE_DICT_PICKLE', 'rb') as handle:
    		replaceDict = pickle.load(handle)
    else:
	replaceDict = {}
    '''
    replaceDict = {}
    #load word2vec model
    print('Number of words in the list :: '+str(len(replaceDict.keys())))
    model = word2vec.Word2Vec.load('../VEC_MODELS/fid_w2vec_pos_model')
    for index, row in df.iterrows():
        phrase = row['transcript']
        interaction_id = row['interaction_id']
        nice_interaction_id = row['nice_interaction_id']
        #nice_interaction_id = '123'
        customer_id = row['customer_id']
        call_date = row['call_date']
        rowNum += 1
        print('Processing Row : %s' % rowNum)

        #rawPhrase = basicClean(phrase)

        # clean trans as per business logic, excluding generic rep conversations:
        cleanPhrase = cleanTranscript(phrase)

        #spacy processing:
        spacyPhrase = nlp(cleanPhrase.decode('utf-8'))
        #buildWordTagDict(wordTagDict,spacyPhrase)
        posTaggedWordPhrase = ' '.join(w.lemma_ + '_' + wordPosTag(w) for w in spacyPhrase if w.lemma_ not in spacy.en.STOP_WORDS)

        if interaction_id not in spacyPhraseDict:
            spacyPhraseDict[interaction_id] = posTaggedWordPhrase

    #use spacy tags to create word-tag freq dictionary for the entire corpus
    wordFreqDict = wordListToFreqDict(list(spacyPhraseDict.values()))

    # now we can use the word-tag dict to start replacing words and reducing our sample space
    print('Reducing word feature space so as to reduce variants of words')
    rowNum = 0
    newWordCounter = 0
    for index, row in df.iterrows():
        phrase = row['transcript']
        interaction_id = row['interaction_id']
        nice_interaction_id = row['nice_interaction_id']
        #nice_interaction_id = '123'
        customer_id = row['customer_id']
        call_date = row['call_date']
        spacyPhrase = spacyPhraseDict[interaction_id]
        rowNum += 1
        print('Processing Row : %s' % rowNum)

        #calculate similarity and reduce feature space by reducing words using word2vec and wordnet intersection
        # building sentence one word at a time
        sen = []
        for taggedWord in spacyPhrase.split():
            wordKey = taggedWord  #already in the key format, can be used later
            baseWord, tag = taggedWord.split('_')
            # make sure we see each word only once
            if wordKey in wordFreqDict and baseWord != '' and tag != '':
                wordFreq = wordFreqDict[wordKey]
                #first check the replaceDict if this key is already present, replace if present and skip the loop iteration
                if wordKey in replaceDict:
                    replaceWord = replaceDict[wordKey]
                    w, t = replaceWord.split('_')
                    sen.append(w)
                    continue
                #continue if no replacement word found in the replaceDict
                #word that is not present in the model or is a unique word
                newWordCounter += 1
                if not wordnet.synsets(baseWord, tag) or baseWord not in model.wv.vocab:
                    sen.append(baseWord)
                    replaceDict[wordKey] = wordKey
                #getting best replacement from word2vec context, note that synset only gets the pos tag synonymns
                #hence even though we don't fetch word2vec context we should be finding the correct replacement word
                else:
                    synonyms = wordnet.synsets(baseWord, tag)
                    lemmas = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
                    synlist = [str(term) for term in lemmas]
                    model_similar_words = model.most_similar(positive=[baseWord], topn=10)
                    modellist = []
                    for item in model_similar_words:
                        item = str(item)
                        item = item.replace('\'', '')
                        item = item.replace('(', '')
                        item = item.replace(')', '')
                        term, score = item.split(',')
                        modellist.append(term)

                    #common words between synsets and w2vec may be more than one
                    commonWord = [word for word in modellist if word in synlist]
                    if len(commonWord) != 0:
                        score = -1
                        for word in commonWord:
                            key = word + '_' + tag
                            score = wordFreq
                            if key in wordFreqDict:
                                score_ = wordFreqDict[key]
                            else:
                                score_ = score
                            if score_ > score:
                                score = score_
                                maxScoreWord = word

                        if score > wordFreq:
                            sen.append(maxScoreWord)
                            #word substitution happening, cache it in the dict
                            replaceDict[wordKey] = maxScoreWord + '_' + tag
                            print('Replacing word :: ' + wordKey + ' with :: ' + maxScoreWord)
                        else:
                            #substitution not happening for the word, add original to the dict
                            sen.append(baseWord)
                            replaceDict[wordKey] = wordKey

                    #nothing common was found with w2vec, pick up best replacement from synsets
                    else:
                        '''
                        #pick max frequency from the wordfreqdict
                        for syn in lemmas:
                            synKey = syn+'_'+tag
                            # word-tag distribution likely to follow zipf's law
                            if synKey in wordFreqDict and wordFreqDict[synKey] > wordFreq:
                                #replace the word
                                wordFreq = wordFreqDict[synKey] #if there is a different higher freq word in synset
                                replaceDict[wordKey] = synKey
                                baseWord = syn
                        '''

                        sen.append(baseWord)
                        if wordKey not in replaceDict:
                            replaceDict[wordKey] = wordKey

        #phrase = ' '.join(wordnet.synsets(w)[0].lemmas()[0].name() for w in phrase.split())
        phrase = ' '.join(w for w in sen)
        phrase = phrase.replace('-PRON-_n', '')
        phrase = phrase.replace(' whatev ', ' ')
        phrase = re.sub(r'\b\w{1,2}\b', '', phrase)
        phrase = phrase.replace('\'', '')
        phrase = phrase.replace('  ', ' ')
        phrase = phrase.strip()
        #print(phrase)

        #collecting all clean lemmatized phrases by interaction id in a dataframe, this df will be used with gensim phraser
        df_clean_trans.append_row(index, {'interaction_id': interaction_id,
                                          'nice_interaction_id': nice_interaction_id,
                                          'customer_id': customer_id,
                                          'call_date': call_date,
                                          'phrase': phrase})

        #except:
            #print('Encountered issue with Spacy unicode token!!')


    #modify this function to directly read data from disk using LineSentences function of Gensim
    #function to build gensim phraser


    #convert to normal dataframe
    print('New words seen :: '+str(newWordCounter))
    print('Num of words replaced : '+str(len(replaceDict.keys())))

    #pickle the dictionary:
    #with open('../REPLACE_DICT_PICKLE', 'wb') as handle:
    #	pickle.dump(replaceDict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    data_dict = df_clean_trans.to_dict(index=False)
    df_clean_trans = pd.DataFrame(data_dict, columns=df_clean_trans.columns, index=df_clean_trans.index)
    df_phrased = phraseBuilder(df_clean_trans)

    #feed tokenized transcript through the POS context extractor logic, which will create final file for LDA input
    chunkProcessedTrans(df_phrased)

    #write gensim phrase to file
    #nlpOutput(df_phrased)	

    end_time = datetime.datetime.now()
    print('Completed process in ::' + str(end_time - start_time))
    exit(0)
Example #15
def spacy_tokenizer(text):
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc]
    tokens = [i for i in tokens if i != '-PRON-']
    return tokens 
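
The function above reloads the model on every call; a hedged variant that loads it once at import time, with example output from a spaCy 2.x model:

import en_core_web_sm

_nlp = en_core_web_sm.load()

def spacy_tokenizer_cached(text):
    return [token.lemma_ for token in _nlp(text) if token.lemma_ != '-PRON-']

# spacy_tokenizer_cached("The cats are running")  ->  ['the', 'cat', 'be', 'run']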
Example #16
# coding: utf-8
from datetime import datetime, timedelta

import en_core_web_sm
from flask import g, render_template, request, jsonify, make_response
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.orm import relationship
from flask_oauthlib.provider import OAuth2Provider
from flask_oauthlib.contrib.oauth2 import bind_sqlalchemy
from flask_oauthlib.contrib.oauth2 import bind_cache_grant
from Pipelines import ner, ner_negation

# initialize spacy for preprocessing
spacy_model = en_core_web_sm.load()
# initialize SQLAlchemy instance to hold users
db = SQLAlchemy()


class User(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(40),
                         unique=True,
                         index=True,
                         nullable=False)

    def check_password(self, password):
        return True


class Client(db.Model):
    id = db.Column(db.Integer, primary_key=True)
#!/usr/bin/env python
"""Create annotation tiers using spacy"""
from annotator import Annotator, AnnoSpan, AnnoTier
import re
import en_core_web_sm as spacy_model
spacy_nlp = spacy_model.load()


class TokenSpan(AnnoSpan):
    def __init__(self, token, doc):
        self.doc = doc
        self.start = token.idx
        self.end = token.idx + len(token)
        self.label = token.text
        self.token = token


class SentSpan(AnnoSpan):
    def __init__(self, span, doc):
        self.doc = doc
        self.start = span.start_char
        self.end = span.end_char
        self.label = span.text
        self.span = span


class SpacyAnnotator(Annotator):
    def annotate(self, doc):
        ne_spans = []
        token_spans = []
        ne_chunk_start = None
Example #18
def load_mode(model_name):
    if model_name == 'en':
        return enmodel.load()
    return None
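
The name enmodel is presumably an import alias; a minimal sketch under that assumption:

import en_core_web_sm as enmodel   # assumed alias

nlp = load_mode('en')              # returns the loaded pipeline, or None for unknown names
doc = nlp("load_mode wraps the model import behind a name check.")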
Example #19
import sys

sys.path.insert(0, "..")
import os
import en_core_web_sm
from utils.data_preprocess.annotation_loader import AnnotationLoader
from utils.params import TRAIN_ANNOTATION_SRC, DEV_ANNOTATION_SRC

NLP = en_core_web_sm.load()


def _get_np_root(np_phrase):
    for w in NLP(np_phrase):
        if w.dep_ == "ROOT":
            return w.text


# Because annotation gold labels and NP chunks can be slightly different
# (e.g. the gold value is "Air Line Pilots Association" and the NP chunk value is "The Air Line Pilots Association"),
# we decide whether they refer to the same object by comparing the ROOT word of the NP chunk and the gold value.
def _get_label(labels, sent_id, per, org):
    label = 0
    org_root = _get_np_root(org)
    # loop over list of [ .. (per, org, org_root) .. ]
    for sent_per, sent_org, sent_org_root in labels.get(sent_id, []):
        # check similarity between PERSON and ORGANIZATION-ROOT
        if _get_np_root(per) == _get_np_root(
                sent_per) and org_root == sent_org_root:
            label = 1
            break
    return label
import en_core_web_sm
from spacy.matcher import Matcher

nlp = en_core_web_sm.load()  #load in the english pretrained spacy model
pPt = Matcher(nlp.vocab)  #create a matcher that we can add patterns to

#-------------------------patterns being added to the matcher----------------------------
#pattern to recognize "(name) (optional ,) (optional the) (any lemmatized title)"
pPt.add("pat1", None, [{
    "POS": "PROPN"
}, {
    "POS": "PUNCT",
    "OP": "?"
}, {
    "POS": "DET",
    "OP": "?"
}, {
    "LEMMA": {
        "IN": [
            "director", "engineer", "governer", "mayor", "manager", "official",
            "CEO", "COO", "commissioner", "spokesperson", "spokeswoman",
            "spokesman", "representative", "chief", "coordinator"
        ]
    }
}])
#pattern to recognize "(lemmatized verb) (optional noun) (any lemmatized title)"
pPt.add("pat2", None, [{
    "LEMMA": {
        "IN": ["announce", "hazard", "say", "stated", "issued", "warned"]
    }
}, {
Example #21
# POST request
from fastapi import FastAPI
from pydantic import BaseModel
import spacy
import en_core_web_sm
en_core_web_sm.load()

nlp = spacy.load("en_core_web_sm")
app = FastAPI()


@app.get('/test/a={a}/b={b}')
def calculate(a: int = None, b: int = None):
    c = a + b
    res = {"res": c}
    return res


@app.get('/')
def hello():
    return "Hello1"


class Item(BaseModel):
    POS: str = None


@app.post('/test')
def calculate(request_data: Item):
    if request_data.POS:
        doc = nlp(request_data.POS)
Example #22
    def preprocess(self):
        self.logger.info("****Preparing dataset****")
        if cfg.EMBS_TYPE == 'glove':
            embedding_reader = GloVeReader()
        elif cfg.EMBS_TYPE == 'fasttext':
            embedding_reader = FastTextReader()
        else:
            raise ValueError(f"Unsupported embeddings type {cfg.EMBS_TYPE}")
        pretrained_vectors = embedding_reader.read(cfg.EMBS_FILE)
        vocab = Vocab(embedding_reader.START, embedding_reader.END,
                      embedding_reader.PAD, embedding_reader.UNK, cfg.CSEQ_LEN,
                      cfg.QSEQ_LEN)
        pardir = os.path.dirname(cfg.VOCAB_SAVE)
        if not os.path.exists(pardir):
            os.makedirs(pardir)
        ner = NERTagger(cfg.NER_TAGS_FILE, cfg.CSEQ_LEN)
        pos = PosTagger(cfg.POS_TAGS_FILE, cfg.CSEQ_LEN)
        self.nlp = en_core_web_sm.load()

        train = tfds.load("squad", data_dir="/tf/data/tf_data", split='train')
        AUTOTUNE = tf.data.experimental.AUTOTUNE
        train_context = train.map(lambda x: x['context'],
                                  num_parallel_calls=AUTOTUNE)
        train_question = train.map(lambda x: x['question'],
                                   num_parallel_calls=AUTOTUNE)
        train_ans = train.map(lambda x: x['answers']['text'][0],
                              num_parallel_calls=AUTOTUNE)
        test = tfds.load("squad",
                         data_dir="/tf/data/tf_data",
                         split='validation')
        test_context = test.map(lambda x: x['context'],
                                num_parallel_calls=AUTOTUNE)
        test_question = test.map(lambda x: x['question'],
                                 num_parallel_calls=AUTOTUNE)
        test_ans = test.map(lambda x: x['answers']['text'][0],
                            num_parallel_calls=AUTOTUNE)

        mr = MapReduce()

        self.logger.info("****Preparing training split****")
        train_context = train_context.as_numpy_iterator()
        train_context = mr.process(self.utf8_decoder, train_context)
        train_question = train_question.as_numpy_iterator()
        train_question = mr.process(self.utf8_decoder, train_question)
        train_ans = train_ans.as_numpy_iterator()
        train_ans = mr.process(self.utf8_decoder, train_ans)
        self.logger.info("****Tokenizing training split****")
        train_context = self.nlp.pipe(train_context,
                                      batch_size=128,
                                      n_process=6)
        train_question = self.nlp.pipe(train_question,
                                       batch_size=128,
                                       n_process=6)
        train_ans = self.nlp.pipe(train_ans, batch_size=128, n_process=6)
        self.logger.info("****Tokenized training split****")

        training_context = []
        training_question = []
        training_ans = []
        for context, ques, ans in zip(train_context, train_question,
                                      train_ans):
            ans_start, al = self.substrSearch(ans, context)
            # check for "not found" before shifting the answer start by one
            if len(ques) >= 20 or ans_start == -1 or ans_start + 1 + al >= 250:
                continue
            ans_start += 1
            training_context.append(context)
            training_question.append(ques)
            ans = np.zeros(cfg.CSEQ_LEN, dtype=np.uint8)
            ans[ans_start:ans_start + al] = 1
            training_ans.append(ans)
        self.logger.info("****Filtered training split****")

        vocab.fit(training_context, training_question, pretrained_vectors, 0,
                  0)
        vocab.save(cfg.VOCAB_SAVE)
        train_cidx = vocab.transform(training_context, "source")
        train_ner = ner.transform(training_context)
        train_pos = pos.transform(training_context)
        train_qidx = vocab.transform(training_question, "target")

        cseq = cfg.CSEQ_LEN
        qseq = cfg.QSEQ_LEN

        def gen():
            for cidx, ner, pos, qidx, ans in zip(train_cidx, train_ner,
                                                 train_pos, train_qidx,
                                                 training_ans):
                yield (cidx, ans, qidx, ner, pos)

        train_dataset = tf.data.Dataset.from_generator(
            gen, (tf.int32, tf.uint8, tf.int32, tf.uint8, tf.uint8),
            (tf.TensorShape([cseq]), tf.TensorShape(
                [cseq]), tf.TensorShape([qseq]), tf.TensorShape(
                    [cseq]), tf.TensorShape([cseq])))

        self.logger.info("****Preparing test split****")
        test_context = test_context.as_numpy_iterator()
        test_context = mr.process(self.utf8_decoder, test_context)
        test_question = test_question.as_numpy_iterator()
        test_question = mr.process(self.utf8_decoder, test_question)
        test_ans = test_ans.as_numpy_iterator()
        test_ans = mr.process(self.utf8_decoder, test_ans)
        self.logger.info("****Tokenizing test split****")
        test_context = self.nlp.pipe(test_context, batch_size=128, n_process=6)
        test_question = self.nlp.pipe(test_question,
                                      batch_size=128,
                                      n_process=6)
        test_ans = self.nlp.pipe(test_ans, batch_size=128, n_process=6)
        self.logger.info("****Tokenized test split****")

        testing_context = []
        testing_question = []
        testing_ans = []
        for context, ques, ans in zip(test_context, test_question, test_ans):
            ans_start, al = self.substrSearch(ans, context)
            # check for "not found" before shifting the answer start by one
            if len(ques) >= 20 or ans_start == -1 or ans_start + 1 + al >= 250:
                continue
            ans_start += 1
            testing_context.append(context)
            testing_question.append(ques)
            ans = np.zeros(cfg.CSEQ_LEN, dtype=np.uint8)
            ans[ans_start:ans_start + al] = 1
            testing_ans.append(ans)
        self.logger.info("****Filtered test split****")

        test_cidx = vocab.transform(testing_context, "source")
        test_ner = ner.transform(testing_context)
        test_pos = pos.transform(testing_context)
        test_qidx = vocab.transform(testing_question, "target")

        cseq = cfg.CSEQ_LEN
        qseq = cfg.QSEQ_LEN

        def gen():
            for cidx, ner, pos, qidx, ans in zip(test_cidx, test_ner, test_pos,
                                                 test_qidx, testing_ans):
                yield (cidx, ans, qidx, ner, pos)

        test_dataset = tf.data.Dataset.from_generator(
            gen, (tf.int32, tf.uint8, tf.int32, tf.uint8, tf.uint8),
            (tf.TensorShape([cseq]), tf.TensorShape(
                [cseq]), tf.TensorShape([qseq]), tf.TensorShape(
                    [cseq]), tf.TensorShape([cseq])))

        train_dataset = train_dataset.map(self.make_example,
                                          num_parallel_calls=-1)
        test_dataset = test_dataset.map(self.make_example,
                                        num_parallel_calls=-1)

        self.save(train_dataset, test_dataset)
        self.logger.debug(f"Memory freed: {gc.collect()}")