def match_model_to_words(spacy, keystrokes, vectors):
    """Translate the network's output vectors back into words (nearest neighbour).

    For each predicted vector, every candidate word produced by
    ``char2words(keystrokes)`` is scored by cosine similarity between the
    word's spaCy vector and the predicted vector; the best-scoring word wins.

    Args:
        spacy: a loaded spaCy language model (callable on a string).
        keystrokes: input handed to ``char2words`` to enumerate candidates.
        vectors: sequence of predicted embedding vectors, one per word slot.

    Returns:
        list[str]: the best-matching word per vector, or "" when no candidate
        scored above 0 (original selection threshold preserved).
    """
    possible_words = char2words(keystrokes)
    pred = []
    for i, target in enumerate(vectors):
        # NOTE(review): the 0 floor means words with negative cosine
        # similarity are never selected — preserved from the original.
        best_score = 0
        best_word = ""
        target_norm = np.linalg.norm(target)
        for word in possible_words[i]:
            dv = spacy(word).vector
            denom = np.linalg.norm(dv) * target_norm
            if denom == 0:
                # Zero vector: cosine similarity undefined — skip.
                # (Replaces the original bare `except: pass`, which
                # silently swallowed every error, not just this case.)
                continue
            score = np.dot(dv, target) / denom
            if score > best_score:
                best_score = score
                best_word = word
        pred.append(best_word)
    return pred
def main():
    """Run the full relation-extraction pipeline over every modern play,
    timing each one and reporting progress after every stage."""
    for play in modernPlays:
        began = time.time()
        print('Retrieving text for play: %s' % play)
        text = retrievePlayText(play)
        print('Retrieved text, substituting pronouns')
        text = substitutePronouns(text, verbose=False)
        print('Substituted pronouns, resolving coreferences')
        text = coreferenceResolve(text, verbose=False)
        print('Coreferences resolved, parsing dependencies')
        text = spacy(text, verbose=False)
        print('Dependencies parsed, extracting relationships')
        triples = extractRelationships(text, verbose=False)
        print('Relationships extracted, post processing triples')
        triples = postProcess(triples, play, verbose=False)
        print('Triples post processed, writing to DB')
        writeToDB(triples, play, verbose=False)
        print('Relations written to DB, writing relations to file')
        writeToFile(triples, play + outputFile, verbose=False)
        # NOTE: % binds tighter than +, so outputFile is concatenated after
        # formatting — the printed string is the same either way.
        print('Relations written to file: %s' % play + outputFile)
        elapsed = time.time() - began
        minutes, seconds = divmod(elapsed, 60)
        hours, minutes = divmod(minutes, 60)
        print(
            'Done with %s, full pipeline took %d hours, %02d minutes, %02d seconds'
            % (play, hours, minutes, seconds))
    return
def gettextfromvoice():
    """Flask view: transcribe an uploaded audio/video file with CMU Sphinx,
    extract a subject with spaCy, and return related news as JSON.

    Expects the upload under ``request.files['file']``. Supports:
      mp3       -> converted to wav via ffmpeg
      mp4 / avi -> audio track extracted via moviepy
      wav       -> used directly
    Any other extension returns a failure JSON response.
    """
    file = request.files['file']
    r = sr.Recognizer()
    upload_dir = app.config["AUDIO_UPLOADS"]
    # Take the part after the LAST dot, case-insensitively — the original
    # `split('.')[1]` broke on names like "my.take.mp3".
    ext = file.filename.rsplit('.', 1)[-1].lower()
    if ext == 'mp3':
        oldfile = os.path.join(upload_dir, "oldfile.mp3")
        newfile = os.path.join(upload_dir, "newfile.wav")
        file.save(oldfile)
        # Re-encode to wav; speech_recognition cannot read mp3 directly.
        subprocess.call(['ffmpeg', '-y', '-i', oldfile, newfile])
        harvard = sr.AudioFile(newfile)
    elif ext in ('mp4', 'avi'):
        oldfile = os.path.join(upload_dir, "oldfile." + ext)
        newfile = os.path.join(upload_dir, "newfile.wav")
        file.save(oldfile)
        clip = mp.VideoFileClip(oldfile)
        clip.audio.write_audiofile(newfile)
        harvard = sr.AudioFile(newfile)
    elif ext == 'wav':
        wavfile = os.path.join(upload_dir, "oldfile.wav")
        file.save(wavfile)
        # Was hard-coded to './uploads/' while saving went through the
        # configured dir — now consistent with the other branches.
        harvard = sr.AudioFile(wavfile)
    else:
        return jsonify({"status": "failed", "message": "invalid file format"})
    with harvard as source:
        audio = r.record(source)
    textfromaudio = r.recognize_sphinx(audio)
    subject = spacy(textfromaudio)
    relevantNews = getrelevantNews(subject, textfromaudio)
    return jsonify({"data": relevantNews, "subject": subject})
def getspacy():
    """Flask view: extract a subject from posted JSON and return news
    relevant to that subject."""
    payload = request.get_json()
    subject = spacy(payload['data'])
    # NOTE(review): elsewhere getrelevantNews is called as (subject, text);
    # confirm this one-argument call matches the intended signature.
    relevantNews = getrelevantNews(subject)
    print(relevantNews)
    return jsonify({"data": relevantNews, "subject": subject})
def process_html_files(contents):
    """Extract one beer recipe from an HTML page and store it in Mongo.

    Skips mead and cider recipes (returns None early). Otherwise scrapes
    the style, name, ingredients, instructions and ABV/IBU/SRM specs and
    passes everything to load_mongo().

    Args:
        contents: raw HTML of a single recipe page.
    """
    ABV = ''
    IBU = ''
    SRM = ''
    keywords = []
    specifications = []
    soup = BeautifulSoup(contents, 'html.parser')
    # Beer style: pulled out of the serialized <h3> tag list by raw string
    # surgery rather than DOM navigation — brittle if the markup changes.
    beer_style = str(
        soup.find_all('h3')).split('"recipeCuisine">')[1].split('</a></h3>')[0]
    if re.search('mead', beer_style, re.IGNORECASE):
        return None
    if re.search('cider', beer_style, re.IGNORECASE):
        return None
    # Recipe name: first <h3>, text before the '|' separator.
    recipe_name = str(soup.find('h3')).split('|')[0].replace('<h3>', '').strip()
    if re.search('mead', recipe_name, re.IGNORECASE):
        return None
    if re.search('cider', recipe_name, re.IGNORECASE):
        return None
    # '|' characters are stripped — presumably because '|' is used as a
    # field separator downstream; TODO confirm against the writer.
    recipe_ingredients = str(
        soup.find('div', itemprop="ingredients").get_text()).strip().replace('|', '')
    recipe_ingredients_stemmed = stem_words(recipe_ingredients)
    recipe_ingredients_spacy = spacy(recipe_ingredients)
    recipe_instructions = soup.find('div', itemprop="recipeInstructions").get_text()
    recipe_specifications = soup.find(class_="specs").get_text()
    # Normalise the specs blob: remove all spaces, then re-insert spaces
    # around the known labels so .split() yields label/value token pairs.
    recipe_specifications = re.sub(r' ', '', recipe_specifications)
    recipe_specifications = re.sub(r'n/a', ' n/a ', recipe_specifications)
    recipe_specifications = re.sub(r'ABV', ' ABV ', recipe_specifications)
    recipe_specifications = re.sub(r'IBU', ' IBU ', recipe_specifications)
    recipe_specifications = re.sub(r'SRM', ' SRM ', recipe_specifications)
    recipe_specifications = re.sub(r'Boil', ' Boil ', recipe_specifications)
    recipe_specifications = re.sub(r'Efficiency', ' Efficiency', recipe_specifications)
    recipe_specifications = re.sub(r'byvolume', ' byvolume', recipe_specifications)
    specifications = recipe_specifications.split()
    if specifications:
        # The token after each label is its value, e.g. ['ABV', ':5.2%'].
        for idx, row in enumerate(specifications):
            if 'ABV' in row:
                ABV = specifications[idx + 1].replace(':', '').replace('%', '')
            if 'IBU' in row:
                IBU = specifications[idx + 1].replace(':', '')
            if 'SRM' in row:
                SRM = specifications[idx + 1].replace(':', '')
    # Keywords are only populated when all three specs were found.
    if ABV and IBU and SRM:
        keywords.append(ABV)
        keywords.append(IBU)
        keywords.append(SRM)
        keywords.append(beer_style)
    load_mongo(recipe_name, beer_style, recipe_ingredients,
               recipe_ingredients_stemmed, recipe_ingredients_spacy,
               recipe_instructions, recipe_specifications, ABV, IBU, SRM,
               keywords)
def quote(user_imput):
    """Build a meme image from generated text and return its filename.

    "spacy" selects the spacy() text generator; any other value
    (including "markov") falls back to the markov() generator.
    """
    generators = {"spacy": spacy}
    input_text = generators.get(user_imput, markov)()
    meme = Meme(input_text)
    return meme.save()
def gettextfromimage():
    """Flask view: OCR an uploaded image with Tesseract, extract a subject
    with spaCy, and return related news as JSON.

    Expects the upload under ``request.files['image']``.
    """
    file = request.files['image']
    # Save and re-open from the SAME configured directory — the original
    # saved into app.config["IMAGE_UPLOADS"] but opened from a hard-coded
    # './uploads/', which breaks whenever the config points elsewhere.
    saved_path = os.path.join(app.config["IMAGE_UPLOADS"], file.filename)
    file.save(saved_path)
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    # Use the extension after the LAST dot — `split('.')[1]` broke on
    # multi-dot filenames like "scan.v2.webp".
    if file.filename.rsplit('.', 1)[-1].lower() == 'webp':
        # presumably converted so Tesseract gets a plain RGB image — the
        # non-webp branch skips this; TODO confirm it is webp-specific.
        image = Image.open(saved_path).convert("RGB")
    else:
        image = Image.open(saved_path)
    textfromimage = pytesseract.image_to_string(image, lang='eng')
    subject = spacy(textfromimage)
    relevantNews = getrelevantNews(subject, textfromimage)
    return jsonify({"data": relevantNews, "subject": subject})
def split_sentence(text):
    '''
    Splits a review into a list of sentences using spacy's sentence parser.

    Returns a list of spaCy Span objects, one per sentence.

    Bug fixed: the original hand-rolled loop sliced to ``token.i - 1``,
    silently dropping the last token of every sentence, and — because the
    first token of a doc also has ``sent_start`` set — emitted a bogus
    ``doc[0:-1]`` slice up front. spaCy's own ``Doc.sents`` yields the
    exact sentence spans the loop was trying to reconstruct.
    '''
    doc = spacy(text)
    return list(doc.sents)
def getrelevantNews(nouns, text):
    """Query NewsAPI for articles about `nouns` and keep those whose
    description is semantically similar to `text`.

    Args:
        nouns: query term(s) passed to NewsAPI's everything endpoint.
        text: the user's original text, compared against each article
            description using spaCy vector similarity.

    Returns:
        str: JSON-encoded list of the articles whose similarity to `text`
        exceeds 0.50.
    """
    news = newsapi.get_everything(q=nouns, language='en')
    articles = news["articles"]
    print(articles)
    res = []
    # NOTE(review): nlp(spacy(text)) parses twice (spaCy doc fed back into
    # nlp) — looks odd but preserved; confirm the two names' roles.
    userNews = nlp(spacy(text))
    for article in articles:
        description = article["description"]
        # NewsAPI may return null descriptions; the original crashed here.
        if not description:
            continue
        articleNews = nlp(spacy(description))
        if userNews.similarity(articleNews) > 0.50:
            res.append(article)
    print(res)
    return json.dumps(res)
def get_embedding(string, comp, vecs, weights):
    """Return an embedding for `string`, composed according to `comp`.

    Args:
        string: the phrase; multi-word phrases are '_'-separated.
        comp: composition strategy name. "d"/"new"/"spacy"/"tags"/"dilate"
            operate on the raw string; the rest compose per-word vectors.
        vecs: vector store supporting `in` and `.query(word)`.
        weights: extra weights forwarded to the "d"/"new" strategies.

    Returns:
        The composed embedding, or [] when the string is null or a word is
        missing from `vecs` (and None for an unknown `comp` — preserved).
    """
    if pd.isnull(string):
        return []
    # Whole-string strategies.
    if comp == "d":
        return d(string, vecs, weights)
    if comp == "new":
        return new(string, vecs, weights)
    if comp == "spacy":
        return spacy(string, vecs)
    if comp == "tags":
        return tags(string, vecs)
    if comp == "dilate":
        # NOTE(review): "dilate" calls decompose on the raw string while
        # "decompose" below calls it on the vector list — confirm intended.
        return decompose(string, vecs)
    x = []
    string = string.strip().split('_')
    if (len(string) == 1) and (string[0].strip() in vecs):
        # Single known word: return its vector directly.
        return vecs.query(string[0])
    elif all(word.strip() in vecs for word in string):
        for word in string:
            x.append(vecs.query(word.strip()))
        if comp == "multiply":
            return multiply(x)
        elif comp == "add":
            return add(x)
        elif comp == "lapata":
            return lapata_combination(x)
        elif comp == "decompose":
            return decompose(x)
        elif comp == "weight":
            return weighted_add(x)
        elif comp == "average":
            # Bug fix: result was computed but never returned (always None).
            return average(x)
        elif comp == "lapata_combination":
            # Bug fix: result was computed but never returned (always None).
            return lapata_combination(x)
    else:
        return []
def spacy_lemma(speech):
    """Lemmatise `speech` with spaCy and return the lemmas joined by spaces."""
    return " ".join(token.lemma_ for token in spacy(speech))
# df = df.replace(r'https?:\/\/.*[\r\n]*', "", regex = True) # df = df.replace(r'http\S+', "", regex = True) # df = df.replace(r'tinyurl\S+', "", regex = True) # df = df[:32810] # df.to_csv("tweets_csv.csv", index=False, sep = "|") bt = BotTweet(os.path.join("data", "trump_speeches.txt")) print("!!!", bt.make_short_tweet()) return bt.make_short_tweet() def quote(user_imput): if user_imput == "spacy": input_text = spacy() elif user_imput == "markov": input_text = markov() else: input_text = markov() meme = Meme(input_text) filename = meme.save() return filename if __name__ == "__main__": word = spacy() print(word)
def feature_sentiment(sentence):
    '''
    Aspect-based sentiment scoring of a single sentence.

    Walks the spaCy dependency parse: every opinion word contributes +1/-1
    (scaled by intensifier modifiers, flipped by negation) to the
    noun/aspect it modifies or governs.

    input: a raw sentence string (parsed here with the module-level
        `spacy` model; `opinion_words` and `pos` are module-level lexicons)
    output: (counter_positive, counter_negative) — two Counters mapping
        aspect text to accumulated positive / negative sentiment.
        NOTE(review): `sent_dict` is still built but no longer returned;
        the original docstring described a dictionary result.
    '''
    counter_positive = collections.Counter()
    counter_negative = collections.Counter()
    sent_dict = Counter()
    sentence = spacy(sentence)
    for token in sentence:
        # print(sent_dict)
        # print(token.text,token.dep_, token.head, token.head.dep_)
        # check if the word is an opinion word, then assign sentiment
        if token.text in opinion_words:
            # Words such as worked / crashed / well / useless / enjoyed
            # if word is in positive opinion words add 1, else subtract 1
            sentiment = 1 if token.text in pos else -1
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            if (token.dep_ == "advmod"):
                continue
            elif (token.dep_ == "amod"):
                # adjectival modifier, e.g. "amazing lightness of the
                # laptop": credit the head noun the adjective modifies
                sent_dict[token.head.text] += sentiment
                if sentiment > 0:
                    counter_positive[token.head.text] += sentiment
                elif sentiment < 0:
                    counter_negative[token.head.text] += sentiment
            # for opinion words that are adjectives, adverbs, verbs...
            else:
                for child in token.children:
                    # if there's an adj/adv intensifier (very, pretty,
                    # etc.) add more weight to the sentiment
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")):
                        sentiment *= 1.5
                    # negation child flips the sign (e.g. "not amazing")
                    if child.dep_ == "neg":
                        sentiment *= -1
                        continue
                for child in token.children:
                    # if verb, check for a direct object, e.g.
                    # "enjoyed (verb) the keyboard light (dobj)"
                    # NOTE(review): `&` (bitwise) works here only because
                    # both operands are parenthesised booleans
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):
                        sent_dict[child.text] += sentiment
                        # NOTE(review): the counters are keyed on
                        # token.head.text while sent_dict uses child.text
                        # — confirm this asymmetry is intended
                        if sentiment > 0:
                            counter_positive[token.head.text] += sentiment
                        elif sentiment < 0:
                            counter_negative[token.head.text] += sentiment
                        # check for conjugates (a AND b), then add both:
                        # "Enjoyed both the screen and the keyboard light"
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj = 1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] += sentiment
                            if sentiment > 0:
                                counter_positive[token.head.text] += sentiment
                            elif sentiment < 0:
                                counter_negative[token.head.text] += sentiment
                # intensifier / negation attached to the opinion word's head
                for child in token.head.children:
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"):
                        sentiment *= -1
                # credit sibling nouns of the opinion word, e.g.
                # "OS crashed repeatedly" -> noun "OS", prepending any
                # compound parts ("keyboard light")
                for child in token.head.children:
                    noun = ""
                    if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                        noun = child.text
                        # Check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.text + " " + noun
                        sent_dict[noun] += sentiment
                        if sentiment > 0:
                            counter_positive[token.head.text] += sentiment
                        elif sentiment < 0:
                            counter_negative[token.head.text] += sentiment
    # return sent_dict
    return counter_positive, counter_negative
def getspacy():
    """Flask view: extract a subject from posted JSON text and return news
    relevant to both the subject and the original text."""
    payload = request.get_json()
    text = payload['data']
    subject = spacy(text)
    relevantNews = getrelevantNews(subject, text)
    return jsonify({"data": relevantNews, "subject": subject})
# --- Evaluation setup: counters and tIoU thresholds ---
low_count = 0
low_counts = 0
# presumably the tIoU cutoffs at which localization hits are tallied —
# TODO confirm against the metric code that consumes them
tIoU_th_highest = 0.7
tIoU_th_high = 0.5
tIoU_th_mid = 0.3
tIoU_th_low = 0.1
tIoU_list = []
true_tIoU_list = []
gt_tIoU_list = []
# Evaluation loop: gradients disabled.
with torch.no_grad():
    for i, test_data in enumerate(test_data_loader):
        # print('i:', i)
        sentence = test_data['sentence']
        # Collect the verbs of the query sentence(s) via spaCy POS tags.
        for sen in sentence:
            sen = ''.join(map(str, sen))
            sen = spacy(sen)
            verb_list = []
            for token in sen:
                # verb_list.append(token.head.text)
                if token.pos_ == 'VERB':
                    verb_list.append(token.text)
        # NOTE(review): verb_list is reset inside the loop, so only the
        # LAST sentence's verbs survive when `sentence` has several
        # entries — confirm intended.
        sentence = vocab.return_idx(sentence).to(device)
        verb = vocab.return_idx([verb_list]).to(device)
        verb = verb[:, :max_verb_len]  # truncate to the verb-length cap
        proposed_videos = test_data['video']
        context_video = test_data['context_video']
        # Drop the leading batch dimension when present.
        if len(proposed_videos.size()) == 4:
            proposed_videos = proposed_videos.squeeze(dim=0)
            context_video = context_video.squeeze(dim=0)
### Train ### start_train = time.time() # print('epoch:', epoch) best_count = 0 worst_count = train_batch_size model_cap.train() model_vid.train() model_grd.train() print('--- Train ---') for iter_num, batch_data in enumerate(train_data_loader): # print('epoch: {}, iter: {}'.format(epoch, iter_num)) batch_sentence = batch_data['sentence'] batch_verb_list = [] for sentence in batch_sentence: sentence = ''.join(map(str, sentence)) sentence = spacy(sentence) verb_list = [] for token in sentence: # verb_list.append(token.head.text) if token.pos_ == 'VERB': verb_list.append(token.text) batch_verb_list.append(verb_list) batch_sentence = vocab.return_idx(batch_sentence).to(device) batch_verb = vocab.return_idx(batch_verb_list).to(device) batch_verb = batch_verb[:, :max_verb_len] batch_video = batch_data['video'] batch_context_video = batch_data['context_video'] if isinstance(batch_video, list): batch_video = torch.cat([batch_video[i] for i in range(len(batch_video))], dim=0)