Example #1
def match_model_to_words(spacy, keystrokes, vectors):
    """
    Nearest-neighbour lookup run on the network's output. It translates the
    predicted vectors back into words.
    """
    possible_words = char2words(keystrokes)
    pred = []
    for i in range(len(vectors)):
        best_sim = 0.0
        best_word = ""
        for word in possible_words[i]:
            dv = spacy(word).vector
            denom = np.linalg.norm(dv) * np.linalg.norm(vectors[i])
            if denom == 0:
                # skip words with an all-zero vector (out of vocabulary)
                continue
            sim = np.dot(dv, vectors[i]) / denom
            if sim > best_sim:
                best_sim = sim
                best_word = word
        pred.append(best_word)
    return pred
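For reference, a minimal self-contained sketch of the cosine nearest-neighbour step above, assuming a vectors-enabled spaCy model (en_core_web_md here, an assumption) and a hand-picked candidate list standing in for char2words():

import numpy as np
import spacy as spacy_lib

nlp_vec = spacy_lib.load("en_core_web_md")      # assumed model with word vectors
candidates = ["cat", "car", "cap"]               # stand-in for char2words() output
target = nlp_vec("cat").vector                   # stand-in for one network output vector

best_word, best_sim = "", -1.0
for word in candidates:
    dv = nlp_vec(word).vector
    denom = np.linalg.norm(dv) * np.linalg.norm(target)
    if denom == 0:                               # skip words without a usable vector
        continue
    sim = float(np.dot(dv, target) / denom)      # cosine similarity
    if sim > best_sim:
        best_sim, best_word = sim, word
print(best_word, best_sim)                       # "cat" with similarity 1.0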
Example #2
def main():
    for play in modernPlays:
        startTime = time.time()

        print('Retrieving text for play: %s' % play)
        playText = retrievePlayText(play)
        print('Retrieved text, substituting pronouns')
        playText = substitutePronouns(playText, verbose=False)
        print('Substituted pronouns, resolving coreferences')
        playText = coreferenceResolve(playText, verbose=False)
        print('Coreferences resolved, parsing dependencies')
        playText = spacy(playText, verbose=False)
        print('Dependencies parsed, extracting relationships')
        relations = extractRelationships(playText, verbose=False)
        print('Relationships extracted, post processing triples')
        relations = postProcess(relations, play, verbose=False)
        print('Triples post processed, writing to DB')
        writeToDB(relations, play, verbose=False)
        print('Relations written to DB, writing relations to file')
        writeToFile(relations, play + outputFile, verbose=False)
        print('Relations written to file: %s' % (play + outputFile))

        endTime = time.time()
        totalSeconds = endTime - startTime
        m, s = divmod(totalSeconds, 60)
        h, m = divmod(m, 60)

        print(
            'Done with %s, full pipeline took %d hours, %02d minutes, %02d seconds'
            % (play, h, m, s))
    return
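The hours/minutes/seconds formatting in main() hinges on two divmod calls; a standalone sketch of that conversion (the 7322-second duration is just an illustrative value):

total_seconds = 7322
m, s = divmod(total_seconds, 60)   # 7322 s -> 122 min, 2 s
h, m = divmod(m, 60)               # 122 min -> 2 h, 2 min
print('%d hours, %02d minutes, %02d seconds' % (h, m, s))   # 2 hours, 02 minutes, 02 seconds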
def gettextfromvoice():
    file = request.files['file']
    r = sr.Recognizer()
    ext = file.filename.rsplit('.', 1)[-1].lower()
    if ext == 'mp3':
        file.save(os.path.join(app.config["AUDIO_UPLOADS"], "oldfile.mp3"))
        oldfile = os.path.join(app.config["AUDIO_UPLOADS"], "oldfile.mp3")
        newfile = os.path.join(app.config["AUDIO_UPLOADS"], "newfile.wav")
        subprocess.call(['ffmpeg', '-y', '-i', oldfile, newfile])
        harvard = sr.AudioFile(newfile)
    elif ext == 'mp4' or ext == 'avi':
        file.save(os.path.join(app.config["AUDIO_UPLOADS"], "oldfile." + ext))
        oldfile = os.path.join(app.config["AUDIO_UPLOADS"], "oldfile." + ext)
        newfile = os.path.join(app.config["AUDIO_UPLOADS"], "newfile.wav")
        clip = mp.VideoFileClip(oldfile)
        clip.audio.write_audiofile(newfile)
        harvard = sr.AudioFile(newfile)
    elif ext == 'wav':
        file.save(os.path.join(app.config["AUDIO_UPLOADS"], "oldfile.wav"))
        harvard = sr.AudioFile(os.path.join(app.config["AUDIO_UPLOADS"], "oldfile.wav"))
    else:
        return jsonify({"status": "failed", "message": "invalid file format"})
    with harvard as source:
        audio = r.record(source)
    textfromaudio = r.recognize_sphinx(audio)
    subject = spacy(textfromaudio)
    relevantNews = getrelevantNews(subject, textfromaudio)
    return jsonify({"data": relevantNews, "subject": subject})
def getspacy():
    somejsonfile = request.get_json()
    subject = spacy(somejsonfile['data'])
    # textblob(somejsonfile['data'])
    relevantNews = getrelevantNews(subject, somejsonfile['data'])
    print(relevantNews)
    return jsonify({"data": relevantNews, "subject": subject})
def process_html_files(contents):
    """Parse a recipe page: pull out the name, style, ingredients, instructions and ABV/IBU/SRM specs, then load them into Mongo."""
    ABV = ''
    IBU = ''
    SRM = ''
    keywords = []
    specifications = []
    soup = BeautifulSoup(contents, 'html.parser')
    beer_style = str(
        soup.find_all('h3')).split('"recipeCuisine">')[1].split('</a></h3>')[0]
    if re.search('mead', beer_style, re.IGNORECASE):
        return None
    if re.search('cider', beer_style, re.IGNORECASE):
        return None
    recipe_name = str(soup.find('h3')).split('|')[0].replace('<h3>',
                                                             '').strip()
    if re.search('mead', recipe_name, re.IGNORECASE):
        return None
    if re.search('cider', recipe_name, re.IGNORECASE):
        return None
    recipe_ingredients = str(
        soup.find('div',
                  itemprop="ingredients").get_text()).strip().replace('|', '')
    recipe_ingredients_stemmed = stem_words(recipe_ingredients)
    recipe_ingredients_spacy = spacy(recipe_ingredients)
    recipe_instructions = soup.find('div',
                                    itemprop="recipeInstructions").get_text()
    recipe_specifications = soup.find(class_="specs").get_text()
    recipe_specifications = re.sub(r' ', '', recipe_specifications)
    recipe_specifications = re.sub(r'n/a', ' n/a ', recipe_specifications)
    recipe_specifications = re.sub(r'ABV', ' ABV ', recipe_specifications)
    recipe_specifications = re.sub(r'IBU', ' IBU ', recipe_specifications)
    recipe_specifications = re.sub(r'SRM', ' SRM ', recipe_specifications)
    recipe_specifications = re.sub(r'Boil', ' Boil ', recipe_specifications)
    recipe_specifications = re.sub(r'Efficiency', ' Efficiency',
                                   recipe_specifications)
    recipe_specifications = re.sub(r'byvolume', ' byvolume',
                                   recipe_specifications)
    specifications = recipe_specifications.split()
    if specifications:
        for idx, row in enumerate(specifications):
            if 'ABV' in row:
                ABV = specifications[idx + 1].replace(':', '').replace('%', '')
            if 'IBU' in row:
                IBU = specifications[idx + 1].replace(':', '')
            if 'SRM' in row:
                SRM = specifications[idx + 1].replace(':', '')
        if ABV and IBU and SRM:
            keywords.append(ABV)
            keywords.append(IBU)
            keywords.append(SRM)
            keywords.append(beer_style)
            load_mongo(recipe_name, beer_style, recipe_ingredients,
                       recipe_ingredients_stemmed, recipe_ingredients_spacy,
                       recipe_instructions, recipe_specifications, ABV, IBU,
                       SRM, keywords)
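A self-contained sketch of the itemprop-based scraping used in process_html_files, run on an invented HTML fragment (the markup below is illustrative, not the real recipe pages):

from bs4 import BeautifulSoup

html = '''
<h3>Pale Ale | Recipes</h3>
<div itemprop="ingredients">5 kg pale malt | 30 g hops</div>
<div itemprop="recipeInstructions">Mash, boil, ferment.</div>
'''
soup = BeautifulSoup(html, 'html.parser')
name = str(soup.find('h3')).split('|')[0].replace('<h3>', '').strip()
ingredients = soup.find('div', itemprop="ingredients").get_text().strip().replace('|', '')
instructions = soup.find('div', itemprop="recipeInstructions").get_text()
print(name)          # Pale Ale
print(ingredients)   # 5 kg pale malt  30 g hops
print(instructions)  # Mash, boil, ferment.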
def gettextfromimage():
    file = request.files['image']
    file.save(os.path.join(app.config["IMAGE_UPLOADS"], file.filename))
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if file.filename.rsplit('.', 1)[-1].lower() == 'webp':
        image = Image.open(os.path.join(app.config["IMAGE_UPLOADS"],
                                        file.filename)).convert("RGB")
    else:
        image = Image.open(os.path.join(app.config["IMAGE_UPLOADS"], file.filename))
    textfromimage = pytesseract.image_to_string(image, lang='eng')
    subject = spacy(textfromimage)
    relevantNews = getrelevantNews(subject, textfromimage)
    return jsonify({"data": relevantNews, "subject": subject})
def split_sentence(text):
    '''
    Splits review into a list of sentences using spacy's sentence parser
    '''
    sentence = spacy(text)
    bag_sentence = []
    start = 0
    for token in sentence:
        # a new sentence begins at this token; close off the previous span
        if token.is_sent_start and token.i > 0:
            bag_sentence.append(sentence[start:token.i])
            start = token.i
        if token.i == len(sentence) - 1:
            bag_sentence.append(sentence[start:(token.i + 1)])
    return bag_sentence
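spaCy also exposes the same boundaries directly through Doc.sents once a parser or sentencizer has run; an equivalent sketch under that assumption:

def split_sentence_via_sents(text):
    # Same splitting using spaCy's built-in sentence iterator.
    return list(spacy(text).sents)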
def getrelevantNews(nouns, text):
    # googlenews.clear()
    # googlenews.search(nouns)
    # relevantNews = googlenews.result()
    # print(relevantNews)
    news = newsapi.get_everything(q=nouns, language='en')
    # news = requests.get("https://gnews.io/api/v3/search?q=nouns&token=911faa3568a07ec3606a958da028ee59")
    # if(text in news):
    # print(news["articles"])
    relevantNews = json.dumps(news["articles"])
    newsArticles = news["articles"]
    print(newsArticles)
    res = []
    userNews = nlp(spacy(text))
    for i in range(0, len(newsArticles)):
        articleNews = nlp(spacy(newsArticles[i]["description"]))
        # print(text)
        # print(newsArticles[i]["description"])
        # print(userNews.similarity(articleNews))
        # print("************")
        if userNews.similarity(articleNews) > 0.50:
            res.append(newsArticles[i])
    print(res)
    return json.dumps(res)
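The 0.50 cut-off above relies on Doc.similarity; a minimal sketch of that comparison, assuming a vectors-enabled pipeline such as en_core_web_md loaded as nlp (the model choice is an assumption):

import spacy as spacy_lib

nlp = spacy_lib.load("en_core_web_md")
user_doc = nlp("electric cars and battery technology")
article_doc = nlp("New battery research could extend electric vehicle range")
score = user_doc.similarity(article_doc)   # cosine similarity of the averaged word vectors
print(score, score > 0.50)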
def get_embedding(string, comp, vecs, weights):
    if pd.isnull(string):
        return []
    else:
        if comp == "d":
            return d(string, vecs, weights)
        if comp == "new":
            return new(string, vecs, weights)
        if comp == "spacy":
            return spacy(string, vecs)
        if comp == "tags":
            return tags(string, vecs)
        if comp == "dilate":
            return decompose(string, vecs)
        x = []
        string = string.strip().split('_')
        if (len(string) == 1) and (string[0].strip() in vecs):
            return vecs.query(string[0])
        elif all(word.strip() in vecs for word in string):
            for word in string:
                x.append(vecs.query(word.strip()))
            if (comp == "multiply"):
                return multiply(x)
            elif (comp == "add"):
                return add(x)
            elif (comp == "lapata"):
                return lapata_combination(x)
            elif (comp == "decompose"):
                return decompose(x)
            elif (comp == "weight"):
                return weighted_add(x)
            elif (comp == "average"):
                average(x)
            elif (comp == "lapata_combination"):
                lapata_combination(x)
        else:
            return []
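A small sketch of the element-wise compositions get_embedding dispatches to ("add" and "multiply"), using plain numpy arrays in place of the vecs store; the add/multiply helpers are assumed to behave exactly like this:

import numpy as np

def add(vectors):
    # element-wise sum of the component word vectors
    return np.sum(vectors, axis=0)

def multiply(vectors):
    # element-wise (Hadamard) product of the component word vectors
    return np.prod(vectors, axis=0)

x = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
print(add(x))        # [4. 6.]
print(multiply(x))   # [3. 8.]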
Example #11
def spacy_lemma(speech):
    doc = spacy(speech)
    return " ".join([token.lemma_ for token in doc])


def spacy():
    # Generate a short tweet-length string from the Trump speeches corpus via BotTweet.
    # df = df.replace(r'https?:\/\/.*[\r\n]*', "", regex = True)
    # df = df.replace(r'http\S+', "", regex = True)
    # df = df.replace(r'tinyurl\S+', "", regex = True)
    # df = df[:32810]

    # df.to_csv("tweets_csv.csv", index=False, sep = "|")

    bt = BotTweet(os.path.join("data", "trump_speeches.txt"))

    print("!!!", bt.make_short_tweet())
    return bt.make_short_tweet()


def quote(user_input):
    if user_input == "spacy":
        input_text = spacy()
    elif user_input == "markov":
        input_text = markov()
    else:
        input_text = markov()

    meme = Meme(input_text)
    filename = meme.save()

    return filename


if __name__ == "__main__":
    word = spacy()
    print(word)
def feature_sentiment(sentence):
    '''
    input: a sentence (string)
    function: builds a per-feature sentiment score from the opinion words and
              the dependency parse of the sentence
    output: two Counters with the positive and the negative scores per feature
    '''

    counter_positive = collections.Counter()

    counter_negative = collections.Counter()

    sent_dict = Counter()

    sentence = spacy(sentence)

    for token in sentence:
        # print(sent_dict)

        #    print(token.text,token.dep_, token.head, token.head.dep_)
        # check if the word is an opinion word, then assign sentiment
        if token.text in opinion_words:  # Words such as worked / crashed / well / useless / enjoyed
            #   print(token)

            # print(token.text, 'main_token')
            sentiment = 1 if token.text in pos else -1  # +1 if the word is in the positive opinion words, -1 if it is in the negative ones

            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            if (token.dep_ == "advmod"):
                # print(token, 'advmod')
                continue
            elif (
                token.dep_ == "amod"):  # adjectival modifier --> "amazing lightness of the laptop": "amazing" is the adjectival modifier

                sent_dict[token.head.text] += sentiment  # important --> "amazing" is the amod here, so its head ("lightness") is what gets added to the dict
                if sentiment > 0:
                    counter_positive[token.head.text] += sentiment
                elif sentiment < 0:
                    counter_negative[token.head.text] += sentiment

            # for opinion words that are adjectives, adverbs, verbs...
            else:

                for child in token.children:  # for example: issues has child many, which is an adjectival modifier
                    # print(child, 'child', child.dep_)
                    # if there's an adj modifier (i.e. very, pretty, etc.), add more weight to the sentiment
                    # This could be better updated for modifiers that either positively or negatively emphasize
                    #  if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words): #does this have to be in opinion words
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")):  # does this have to be in opinion words
                        sentiment *= 1.5
                        # print(sentiment, token, 'token sentiment')
                    # check for negation words and flip the sign of sentiment  --> double negative e.g. not amazing
                    if child.dep_ == "neg":
                        sentiment *= -1
                        continue

                for child in token.children:

                    # if the token is a verb, check whether it has a direct object ("lijdend voorwerp" in Dutch), e.g. enjoyed (verb) the keyboard light (direct object)
                    if (token.pos_ == "VERB") and (child.dep_ == "dobj"):

                        sent_dict[child.text] += sentiment
                        if sentiment > 0:
                            counter_positive[token.head.text] += sentiment
                        elif sentiment < 0:
                            counter_negative[token.head.text] += sentiment

                        # check for conjugates (a AND b), then add both to dictionary
                        # Example: Enjoyed both the screen and the keyboard light
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj = 1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] += sentiment
                            if sentiment > 0:
                                counter_positive[token.head.text] += sentiment
                            elif sentiment < 0:
                                counter_negative[token.head.text] += sentiment

                # check for negation
                for child in token.head.children:
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"):
                        sentiment *= -1

                # check for nouns
                for child in token.head.children:
                    noun = ""
                    if (child.pos_ == "NOUN") and (child.text not in sent_dict):  # OS crashed repeatedly -->
                        noun = child.text
                        # Check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.text + " " + noun
                        sent_dict[noun] += sentiment
                        if sentiment > 0:
                            counter_positive[token.head.text] += sentiment
                        elif sentiment < 0:
                            counter_negative[token.head.text] += sentiment

    # return sent_dict
    return counter_positive, counter_negative
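A compact sketch of the dependency attributes feature_sentiment leans on (dep_, head and children), run on a toy sentence; it assumes spacy is a loaded English pipeline with a parser (e.g. en_core_web_sm, an assumption here):

doc = spacy("The screen is not very bright")
for token in doc:
    # dep_ is the dependency label, head the governing token, children the dependents
    # ("not" is typically attached with dep_ == "neg", "very" as "advmod").
    print(token.text, token.dep_, token.head.text,
          [child.text for child in token.children])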
def getspacy():
    somejsonfile = request.get_json()
    text = somejsonfile['data']
    subject = spacy(somejsonfile['data'])
    relevantNews = getrelevantNews(subject, text)
    return jsonify({"data": relevantNews, "subject": subject})
low_count = 0
low_counts = 0
tIoU_th_highest = 0.7
tIoU_th_high = 0.5
tIoU_th_mid = 0.3
tIoU_th_low = 0.1
tIoU_list = []
true_tIoU_list = []
gt_tIoU_list = []
with torch.no_grad():
    for i, test_data in enumerate(test_data_loader):
        # print('i:', i)
        sentence = test_data['sentence']
        for sen in sentence:
            sen = ''.join(map(str, sen))
            sen = spacy(sen)
            verb_list = []
            for token in sen:
                # verb_list.append(token.head.text)
                if token.pos_ == 'VERB':
                    verb_list.append(token.text)

        sentence = vocab.return_idx(sentence).to(device)
        verb = vocab.return_idx([verb_list]).to(device)
        verb = verb[:, :max_verb_len]

        proposed_videos = test_data['video']
        context_video = test_data['context_video']
        if len(proposed_videos.size()) == 4:
            proposed_videos = proposed_videos.squeeze(dim=0)
            context_video = context_video.squeeze(dim=0)
Example #16
    ### Train ###
    start_train = time.time()
    # print('epoch:', epoch)
    best_count = 0
    worst_count = train_batch_size
    model_cap.train()
    model_vid.train()
    model_grd.train()
    print('--- Train ---')
    for iter_num, batch_data in enumerate(train_data_loader):
        # print('epoch: {}, iter: {}'.format(epoch, iter_num))
        batch_sentence = batch_data['sentence']
        batch_verb_list = []
        for sentence in batch_sentence:
            sentence = ''.join(map(str, sentence))
            sentence = spacy(sentence)
            verb_list = []
            for token in sentence:
                # verb_list.append(token.head.text)
                if token.pos_ == 'VERB':
                    verb_list.append(token.text)
            batch_verb_list.append(verb_list)
        batch_sentence = vocab.return_idx(batch_sentence).to(device)
        batch_verb = vocab.return_idx(batch_verb_list).to(device)
        batch_verb = batch_verb[:, :max_verb_len]

        batch_video = batch_data['video']
        batch_context_video = batch_data['context_video']
        if isinstance(batch_video, list):
            batch_video = torch.cat([batch_video[i]
                                     for i in range(len(batch_video))], dim=0)
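Finally, a minimal sketch of the list-flattening step at the end of the training loop: concatenating per-sample feature tensors along the batch dimension with torch.cat (the tensor shapes here are illustrative only):

import torch

clips = [torch.randn(1, 16, 512), torch.randn(1, 16, 512)]   # two per-sample feature tensors
batch = torch.cat(clips, dim=0)                               # stack along the batch dimension
print(batch.shape)                                            # torch.Size([2, 16, 512])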