def clean_caption_biography(self):
    caption, biography = self.caption_biography()
    if biography:
        caption.append(biography)
        self.caption = caption
    else:
        self.caption = caption
    clean_cap_bio = []
    for unclean_text in self.caption:
        if isinstance(unclean_text, unicode):
            # remove emoji, hashtag, url, html
            clean_text = ' '.join(
                re.sub(r"([@#][A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                       " ", unclean_text).split())
            clean_text = " ".join(
                singularize(w).lower()
                for w in nltk.wordpunct_tokenize(clean_text)
                if singularize(w).lower() in words and w.isalpha())
            clean_cap_bio.append(clean_text)
        else:
            clean_cap_bio = []
    self.documents = clean_cap_bio
    return self.documents
def get_synset(word, pos=''):
    synset = None
    try:
        if pos:
            synset = wordnet.synsets(singularize(lemmatise(word)), w[pos])[0]
        else:
            synset = wordnet.synsets(singularize(lemmatise(word)))[0]
    except IndexError:
        try:
            if pos:
                synset = wordnet.synsets(lemmatise(word), w[pos])[0]
            else:
                synset = wordnet.synsets(lemmatise(word))[0]
        except IndexError:
            try:
                if pos:
                    synset = wordnet.synsets(singularize(word), w[pos])[0]
                else:
                    synset = wordnet.synsets(singularize(word))[0]
            except IndexError:
                try:
                    if pos:
                        synset = wordnet.synsets(word, w[pos])[0]
                    else:
                        synset = wordnet.synsets(word)[0]
                except IndexError:
                    pass
    if pos and synset is None:
        return get_synset(word)
    return synset
def searchScore(projectDescriptions, tools_materials):
    if len(tools_materials) == 0:
        return
    scoreList = []
    for projDesc in projectDescriptions:
        descEntities = getEntities(projDesc.lower())
        score = 0.0
        for tm in tools_materials:
            for descEnt in descEntities:
                if singularize(tm.lower()) == singularize(descEnt.name):
                    score += 1
        score = score / len(tools_materials)
        scoreList.append(score)
    return scoreList
def main():
    for i in range(len(list1)):
        for label in [list1[i]]:
            for g in wn.synsets(label):
                # get the gloss definition of each synset
                deftoken = g.definition()
                # print(deftoken)
                # tokenize the definition
                tkn = word_tokenize(deftoken)
                for w in tkn:
                    # remove stop words
                    if w not in stop_words and len(w) > 1:
                        # b. Convert all plural noun forms (irregular and regular)
                        #    to singular noun form using the pattern library.
                        token = singularize(w)
                        token = ''.join(ch for ch in token if not ch.isdigit())
                        token = token.lower()
                        token = WordNetLemmatizer().lemmatize(token, 'v')
                        # adjective: pos='a'
                        token = WordNetLemmatizer().lemmatize(token, 'a')
                        # adverb: pos='r'
                        token = WordNetLemmatizer().lemmatize(token, 'r')
                        wordFromList1 = wn.synsets(label)
                        wordFromList2 = wn.synsets(token)
                        if wordFromList1 and wordFromList2:
                            s = wordFromList1[0].path_similarity(wordFromList2[0])
                            if s is not None and s > 0.001:
                                # save all ontologies in a txt file
                                print(label, '~', token, '=', s, '.', file=data)
def wikifilter(keyword):
    wiki_wiki = wikipediaapi.Wikipedia('en')
    candidate = {}
    for key in keyword.keys():
        page_py = wiki_wiki.page(key)
        if page_py.exists():
            candidate[key] = keyword[key]
        else:
            singles = singularize(key)
            page_py = wiki_wiki.page(singles)
            if page_py.exists():
                candidate[singles] = keyword[key]
    # print(candidate)
    final = {}
    redirect = {}
    relation = {}
    for ca in candidate:
        query = requests.get(
            r'https://en.wikipedia.org/w/api.php?action=query&titles={}&&redirects&format=json'
            .format(ca))
        data = json.loads(query.text)
        PAGES = data["query"]["pages"]
        for v in PAGES.values():
            redirect[ca] = v["title"]
            relation[v["title"]] = ca
            final[v["title"]] = 0
    for ca in redirect.keys():
        final[redirect[ca]] = candidate[ca]
    # print(final)
    return relation, final
def has_one(model_name, **relation_kwargs):
    """Connect a parent table to a child in a one-to-one relation.

    class Parent:
        child = relationship("Child", uselist=False, back_populates="parent")
    """
    caller_namespace, db, table_name, caller_table_name, caller_class_name = rip_context_info(
        model_name)
    variable_name = table_name
    caller_namespace[f'{variable_name}'] = db.relationship(
        f'{model_name}',
        back_populates=f'{caller_table_name}',
        uselist=False,
        **relation_kwargs,
    )
    try:
        callback = global_relations[caller_table_name][table_name]
    except KeyError:
        callback = None
    if callback:
        back_populates = singularize(table_name)
        callback(back_populates)
def _map_categories(category_id, sativa, indica, data, menu_items):
    """
    Map a menu item that is % indica and % sativa to a category.
    If sativa is above the threshold it goes into Sativa, if indica is above
    the threshold it goes into Indica, and otherwise it goes into Hybrid.
    The other conditions map to G1's naming convention, e.g. MMJ Drinks => G1 Drink.
    """
    category = list(data.keys())[list(data.values()).index(category_id)]
    if category.lower() == 'cannabis':
        if sativa > 0 and indica > 0:
            if sativa > 80:
                return 'Sativa'
            if indica > 80:
                return 'Indica'
            else:
                return 'Hybrid'
    if category.lower() == 'paraphernalia':
        return 'Gear'
    if category.lower() == 'tincture':
        return 'Tinctures'
    if category.lower() == 'prerolled':
        return 'Preroll'
    if category in PLURAL_CATEGORIES:
        return singularize(category)
    if category not in CAT_MAP:
        return 'Other'
    return category
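# A hedged usage sketch for _map_categories() above. The lookup tables below are
# made-up stand-ins for the module's real PLURAL_CATEGORIES / CAT_MAP globals and
# category data; the expected outputs are indicative, not guaranteed.
PLURAL_CATEGORIES = ['Edibles', 'Concentrates']        # hypothetical
CAT_MAP = {'Flower': 'Flower', 'Edibles': 'Edible'}    # hypothetical
sample_data = {'Cannabis': 1, 'Edibles': 2, 'Paraphernalia': 3}
print(_map_categories(1, sativa=90, indica=10, data=sample_data, menu_items=None))  # 'Sativa'
print(_map_categories(2, 0, 0, sample_data, None))   # plural category -> singularized, e.g. 'Edible'
print(_map_categories(3, 0, 0, sample_data, None))   # paraphernalia -> 'Gear'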
def body(self):
    for c in self.columns:
        if c[0] == 'id':
            continue
        self.document += "\t$" + c[0] + " = $_POST['" + c[0] + "'];\n"
    self.document += "\n"
    self.document += "\t$" + self.connectorName + " = new " + self.connectorName + "($conn);\n\n"
    self.document += "\tif(!$" + self.connectorName + "->create("
    first = True
    for c in self.columns:
        if c[0] == 'id':
            continue
        if not first:
            self.document += ", "
        first = False
        self.document += "$" + c[0]
    self.document += ")) {\n"
    self.document += "\t\t$response['success'] = false;\n"
    self.document += "\t\t$response['message'] = \"Failed to create " + singularize(
        self.table) + "!\";\n"
    self.document += "\t}\n"
    self.document += "\telse {\n"
    self.document += "\t\t$response['success'] = true;\n"
    self.document += "\t}\n\n"
    self.document += "\techo(json_encode($response));\n"
    self.document += "?>"
def stem(self, word):
    token = singularize(word)
    conjugation = conjugate(token, 'inf')
    if conjugation:
        token = str(conjugation)
    return token
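# A minimal sketch of what stem() above does, calling pattern.text.en directly:
# the word is singularized, then reduced to its infinitive if it conjugates.
# The example words and expected outputs are illustrative assumptions.
from pattern.text.en import singularize, conjugate

for word in ['studies', 'likes', 'running']:
    token = singularize(word)        # drop the plural, e.g. 'studies' -> 'study'
    base = conjugate(token, 'inf')   # back to the infinitive, e.g. 'running' -> 'run'
    print(word, '->', base or token)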
def get_synset(word, wpos):
    synset = None
    try:
        synset = wordnet.synsets(singularize(lemmatise(word)), wpos)[0]
    except IndexError:
        pass
    return synset
def scrape_tools(self):
    tools = []
    with open(self.page) as fp:
        line = fp.readline()
        while line:
            tools.append(singularize(line.strip().lower()))
            line = fp.readline()
    return tools
def __init__(self, t):
    self.connectorName = singularize(t.title()) + "Connector"
    self.table = t
    self.document = "<?php\n"
    self.includes()
    self.body()
def body(self):
    self.document += "\t$id = $_POST['id'];\n\n"
    self.document += "\t$" + self.connectorName + " = new " + self.connectorName + "($conn);\n\n"
    self.document += "\t$response['" + singularize(
        self.table) + "'] = $" + self.connectorName + "->delete($id);\n"
    self.document += "\t$response['success'] = true;\n\n"
    self.document += "\techo(json_encode($response));\n"
    self.document += "?>"
def wordOrPluralInList(checkList, referenceList, i):
    from pattern.text.en import pluralize, singularize
    if (checkList[i] in referenceList
            or pluralize(checkList[i]) in referenceList
            or singularize(checkList[i]) in referenceList):
        return True
    return False
def parse(pklfile, words, modify_dict):
    good_words = []
    with open(pklfile, 'rb') as pkl:
        x = pickle.load(pkl)
    for result in x.results:
        alts = result.alternatives[:5]
        for alt in alts:
            trans = alt.transcript
            raw_words = [w.lower() for w in trans.split(' ') if w]
            good_words.extend([w for w in raw_words if w in words])
            good_words.extend([remove_nonalpha(w) for w in raw_words
                               if remove_nonalpha(w) in words])
            good_words.extend([singularize(w) for w in raw_words
                               if singularize(w) in words])
            good_words.extend([singularize(remove_nonalpha(w)) for w in raw_words
                               if singularize(remove_nonalpha(w)) in words])
            good_words.extend([modify_match(w, modify_dict, 1) for w in raw_words
                               if modify_match(w, modify_dict, 1) in words])
    good_words = set(good_words)
    return good_words
def similar(text1, text2):
    found = False
    count = 0
    text1 = text1.split(' ')
    text2 = text2.split(' ')
    # ----------------------------------------------
    # Singularize the words
    text1 = [singularize(word) for word in text1]
    text2 = [singularize(word) for word in text2]
    # ----------------------------------------------
    # Convert to lower case
    text1 = [word.lower() for word in text1]
    text2 = [word.lower() for word in text2]
    # ----------------------------------------------
    # Convert verbs to their present (base) form
    text1 = [WordNetLemmatizer().lemmatize(word, 'v') for word in text1]
    text2 = [WordNetLemmatizer().lemmatize(word, 'v') for word in text2]
    for word in text1:
        if len(word) > 3:
            if singularize(word) in text2:
                count += 1
            else:
                for syn in wordnet.synsets(word):
                    for name in set(syn.lemma_names()):
                        if name in text2:
                            count += 1
                            found = True
                            break
                    if found:
                        found = False
                        break
    return count
def sort(df, dhlw):
    # Used to store our cleaned subject data
    cleaned_data = pd.DataFrame(columns=['doi', 'subjects', 'title'])
    cleaned_data_filename = 'data/tru_cleaned.csv'
    if dhlw:
        cleaned_data_filename = 'data/dhlw_cleaned.csv'
    blank_subjects = 0    # number that OSTI listed as blank...
    removed_subjects = 0  # number of subjects that were all digits, dots, *, -, and whitespace
    # p = nltk.PorterStemmer()
    for i, r in df.iterrows():
        subjects_str = r['subjects']
        if not pd.isnull(subjects_str):
            subjects = subjects_str.split(";")
            cleaned_subjects = []
            for s in subjects:
                # first clean by stripping whitespace and lowercasing
                cleaned_s = s.lower().strip()
                # remove digits, dots, dashes, asterisks, and spaces from the start
                cleaned_s = cleaned_s.lstrip('0123456789.-* ')
                if cleaned_s != "":
                    # convert the last word in the subject to its singular form
                    cleaned_s_words = cleaned_s.split(" ")
                    cleaned_s_words[-1] = singularize(cleaned_s_words[-1])
                    cleaned_s = " ".join(cleaned_s_words)
                    subject_counts[cleaned_s] += 1
                    cleaned_subjects.append(cleaned_s)
                else:
                    if s == "":
                        blank_subjects += 1
                    else:
                        removed_subjects += 1
            subjects_str = ';'.join(cleaned_subjects)
        else:
            subjects_str = ""
        cleaned_data = cleaned_data.append(pd.DataFrame(
            {
                'title': r['title'],
                'doi': r['doi'],
                'subjects': subjects_str
            },
            index=[0]),
            ignore_index=True)
    cleaned_data.to_csv(cleaned_data_filename, sep='|')
    print("Blank subjects: " + str(blank_subjects))
    print("Removed subjects: " + str(removed_subjects))
def wordOrPluralInListNoLoop(checkList, referenceList):
    from pattern.text.en import pluralize, singularize
    if (checkList in referenceList
            or pluralize(checkList) in referenceList
            or singularize(checkList) in referenceList):
        return True
    else:
        return False
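# Hedged usage sketch for the two membership helpers above: a word matches if it,
# its plural, or its singular form appears in the reference list. The sample data
# is made up for illustration.
reference = ['apple', 'carrots', 'flour']
print(wordOrPluralInListNoLoop('apples', reference))  # True, via singularize('apples')
print(wordOrPluralInListNoLoop('carrot', reference))  # True, via pluralize('carrot')
print(wordOrPluralInListNoLoop('sugar', reference))   # False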
def scrape_seafood(self):
    seafood = []
    fullHTML = self.soup.find_all('li')
    for x in range(11, 98):
        seafood.append(
            singularize(
                re.sub(r'\(.*?\)', '',
                       fullHTML[x].text.strip().encode('utf-8').lower()).strip()))
    return seafood
def preText(text, pos_bow, neg_bow):
    # parameters:
    #   text:    a sentence, string
    #   pos_bow: positive bag of words, list
    #   neg_bow: negative bag of words, list
    # returns:
    #   score_word_sim: similarity score over all verbs, float
    #   score_bow:      score from the bag-of-words implementation, float

    # pattern that recognizes verb phrases
    pattern = [{"POS": "VERB", "OP": "*"},
               {"POS": "ADV", "OP": "*"},
               {"POS": "VERB", "OP": "+"},
               {"POS": "PART", "OP": "*"}]
    # extract the verb phrases
    doc = textacy.make_spacy_doc(text, lang='en_core_web_lg')
    verbs = textacy.extract.matches(doc, pattern)
    score_word_sim = 0.0
    score_bow = 0.0
    for verb in verbs:
        # singularize the verb, e.g. "likes" -> "like"
        singularized_verb = singularize(verb.text)
        score_word_sim += wordSimilarity(pos_bow, neg_bow, singularized_verb)
        # apply the bag of words to the singularized verb
        score_bow += pos_bow.count(str(singularized_verb))
        score_bow -= neg_bow.count(str(singularized_verb))
    # aggregate all verb similarities
    if score_word_sim > 0.5:
        score_word_sim = 1.0
    elif score_word_sim < -0.5:
        score_word_sim = -1.0
    else:
        score_word_sim = 0.0
    # aggregate the count from the bag of words
    if score_bow > 0.5:
        score_bow = 1.0
    elif score_bow < -0.5:
        score_bow = -1.0
    else:
        score_bow = 0.0
    return score_word_sim, score_bow
def Monitor_DeleteNonEnglishWords(sentence):
    # Clean the sentence of non-English words
    nltk.download('words')
    words = set(nltk.corpus.words.words())
    sentence = " ".join(
        w for w in nltk.wordpunct_tokenize(sentence)
        if singularize(w).lower() in words or not w.isalpha())
    if sentence[-1] != '.':
        sentence += '.'
    return sentence
def scrape_meats(self):
    meats = []
    article = self.soup.find_all('article', {"class": "tag-animal-protein-list"})
    fullHTML = []
    for art in article:
        fullHTML = art.find_all("li")
        for x in range(0, len(fullHTML)):
            meats.append(fullHTML[x].text.strip().encode('utf-8').lower())
    self.meats = meats
    meats = [singularize(m.strip()) for m in meats]
    return meats
def normalize(string):
    plurale_tantum = ['this', 'yes', 'pants', 'shorts', 'glasses', 'scissors',
                      'panties', 'trousers', 'binoculars', 'pliers', 'tongs',
                      'tweezers', 'forceps', 'goggles', 'jeans', 'tights',
                      'leggings', 'chaps', 'boxers', 'indoors', 'outdoors',
                      'bus', 'octapus', 'waitress', 'pasta', 'pita', 'glass',
                      'asparagus', 'hummus', 'dress', 'cafeteria', 'grass', 'class']
    irregulars = {'shelves': 'shelf', 'bookshelves': 'bookshelf',
                  'olives': 'olive', 'brownies': 'brownie', 'cookies': 'cookie'}
    temp = string.strip().lower()
    if temp in irregulars:
        return irregulars[temp]
    if temp.split(' ')[-1] in plurale_tantum or temp[-2:] == 'ss':
        return temp
    return singularize(temp)
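# Quick sketch of normalize() above: known irregulars are mapped directly,
# plurale tantum entries and words ending in "ss" pass through unchanged, and
# everything else goes through pattern's singularize. Expected outputs are
# assumptions about pattern's behaviour, not guarantees.
for word in ['  Shelves ', 'glasses', 'dogs', 'grass']:
    print(repr(word), '->', normalize(word))
# expected roughly: 'shelf', 'glasses', 'dog', 'grass'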
def get_synset(phrase):
    synset = None
    for word, pos in tag(phrase):
        if pos.startswith('N') and word != 'of':
            try:
                synset = wordnet.synsets(singularize(lemmatise(word)))[0]
            except IndexError:
                try:
                    synset = wordnet.synsets(lemmatise(word))[0]
                except IndexError:
                    try:
                        synset = wordnet.synsets(singularize(word))[0]
                    except IndexError:
                        try:
                            synset = wordnet.synsets(word)[0]
                        except IndexError:
                            logging.error("Failed to find synset for '" + word + "'")
                            continue
        elif pos == 'PRP':
            return wordnet.synsets('living thing')[0]
    return synset
def processIngredients(ingredient):
    results = []
    parsed = [singularize(e) for e in sent_parse(ingredient)]
    for element in parsed:
        # exact match
        if element in foodList:
            results += [element]
        # split the current word and exact-match each sub-word
        else:
            for w in element.split(' '):
                if w in foodList:
                    results += [w]
    return list(set(results))
def wikifilter(keyword):
    wiki_wiki = wikipediaapi.Wikipedia('en')
    candidate = {}
    final = {}
    redirect = {}
    relation = {}
    for key in keyword.keys():
        page_py = wiki_wiki.page(key)
        if page_py.exists():
            query = requests.get(
                r'https://en.wikipedia.org/w/api.php?action=query&titles={}&&redirects&format=json'
                .format(key))
            data = json.loads(query.text)
            PAGES = data["query"]["pages"]
            # print(PAGES)
            for v in PAGES.values():
                redirect[key] = v["title"]
                # print(redirect)
                temp_list = relation.get(v["title"], [])
                temp_list.append(key)
                relation[v["title"]] = temp_list
                # print(relation)
                final[v["title"]] = 0
        else:
            singles = singularize(key)
            page_py = wiki_wiki.page(singles)
            if page_py.exists():
                query = requests.get(
                    r'https://en.wikipedia.org/w/api.php?action=query&titles={}&&redirects&format=json'
                    .format(singles))
                data = json.loads(query.text)
                PAGES = data["query"]["pages"]
                # print(PAGES)
                for v in PAGES.values():
                    redirect[key] = v["title"]
                    # print(redirect)
                    temp_list = relation.get(v["title"], [])
                    temp_list.append(key)
                    relation[v["title"]] = temp_list
                    # print(relation)
                    final[v["title"]] = 0
    for k in redirect.keys():
        final[redirect[k]] = final[redirect[k]] + keyword[k]
    # print(final)
    return relation, final
def convert_to_ml(unit, conversion):
    """
    Convert the unit, usually a volume, to a weight so we can calculate the
    fraction of ingredient present.
    """
    unit = singularize(unit.lower())
    qty_ml = 0
    try:
        qty_ml = conversion[unit]
    except KeyError:
        qty_ml = 1
    return qty_ml
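# Minimal sketch of convert_to_ml() above with a made-up conversion table;
# units are lowercased and singularized before lookup, and unknown units fall
# back to 1 so the calculation can still proceed.
conversion_table = {'cup': 240, 'tablespoon': 15, 'teaspoon': 5}  # hypothetical values
print(convert_to_ml('Cups', conversion_table))    # 'Cups' -> 'cup' -> 240
print(convert_to_ml('pinch', conversion_table))   # not in the table -> 1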
def clean_words(description):
    words = re.split("[/ ]+", str(description))
    keywords = []
    for word in words:
        word = word.lower()
        if word not in stop_words:
            word = re.sub('[^A-Za-z]+', '', word)
            if word != '':
                word = WordNetLemmatizer().lemmatize(word, 'v')
                word = singularize(word)
                word = spell(word)
                keywords.append(str(word))
    return keywords
def _validate(self, clue: str, positiveWords: np.array,
              negativeWords: np.array) -> bool:
    clue = clue.lower()
    invalidWords: np.array = np.append(
        self.previousClues, np.append(positiveWords, negativeWords))
    stemmedClue: str = stem(clue)
    singularClue: str = singularize(clue)
    pluralClue: str = pluralize(clue)
    if not clue.isalpha() or not clue.isascii() or set(
            "aeiouy").isdisjoint(clue) or not 2 <= len(clue) <= 12:
        return False
    for word in invalidWords:
        stemmedWord = stem(word)
        singularWord = singularize(word)
        pluralWord = pluralize(word)
        if clue in word or word in clue or stemmedClue in word or stemmedWord in clue or \
                singularClue in word or singularWord in clue or pluralClue in word or pluralWord in clue:
            return False
    return True
def get_plural_singular_name(name):
    d = {}
    for word in name.split():
        val = []
        singular = singularize(word.lower())
        val.append(singular)
        val.append(word.lower())
        d[word] = list(set(val))
    name_list = []
    for combination in product(*d.values()):
        name_list.append(' '.join(combination))
    return name_list
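# Sketch of get_plural_singular_name() above: every word contributes its
# lowercased original and singular form, and itertools.product builds every
# combination. The example input and expected variants are assumptions.
from itertools import product
from pattern.text.en import singularize

print(get_plural_singular_name('Sales Reports'))
# expected to contain variants such as 'sales reports', 'sale report',
# 'sales report' and 'sale reports' (order not guaranteed)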
def convert_postag(complex_word, candidates):
    specific_tag = NLP.pos_tag(complex_word)[0][1]
    generic_tag = get_type(specific_tag)
    final_candidates = set()
    if generic_tag == "NN":  # nouns
        for candidate in candidates:
            candidate_tag = NLP.pos_tag(candidate)[0][1]
            if specific_tag == "NNS" and candidate_tag != "NNS":
                candidate = pluralize(candidate)
            elif specific_tag == "NN" and candidate_tag == "NNS":
                candidate = singularize(candidate)
            final_candidates.add(candidate)
    elif generic_tag == "ADJ":  # adjectives
        for candidate in candidates:
            candidate_tag = NLP.pos_tag(candidate)[0][1]
            if specific_tag == "JJR" and candidate_tag != "JJR":
                candidate = comparative(candidate)
            elif specific_tag == "JJS" and candidate_tag != "JJS":
                candidate = superlative(candidate)
            final_candidates.add(candidate)
    elif generic_tag == "VB":  # verbs
        complex_tenses = tenses(complex_word)
        if len(complex_tenses) < 1:
            return candidates
        # tenses() returns tuples such as ('present', 3, 'singular', ...);
        # keep only the tense name for the comparisons below
        complex_tense = complex_tenses[0][0]
        for candidate in candidates:
            if len(tenses(candidate)) > 0 and tenses(candidate)[0][0] != complex_tense:
                if complex_tense == "past":
                    candidate = conjugate(candidate, tense=PAST)
                elif complex_tense == "present":
                    candidate = conjugate(candidate, tense=PRESENT)
                elif complex_tense == "future":
                    candidate = conjugate(candidate, tense=FUTURE)
                elif complex_tense == "infinitive":
                    candidate = conjugate(candidate, tense=INFINITIVE)
            final_candidates.add(candidate)
    else:
        final_candidates = candidates
    return final_candidates
def prepare_text(input):
    sentences = nltk.sent_tokenize(input)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    for i in range(len(sentences)):
        for j in range(len(sentences[i])):
            sentences[i][j] = de_nonletter.sub('', sentences[i][j].lower())
    sentences = [deleteEmpty(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    for sentence in sentences:
        for i in range(len(sentence)):
            if sentence[i][1] == 'NNS':
                tlst = list(sentence[i])
                tlst[0] = singularize(tlst[0])
                tlst[1] = 'NN'
                sentence[i] = tuple(tlst)
    sentences = [NPChunker.parse(sent) for sent in sentences]
    return sentences
def get_best_unit(ingred_list, conversion):
    """
    Some ingredients have different options, like "1 pkg (7.5 oz)"; the code
    identifies both, and this function tells which one to use.
    """
    for ingred in ingred_list:
        # print(ingred)
        inspect = ingred.strip().split(' ')
        # print(inspect[1].lower())
        new_unit = singularize(inspect[1].lower())
        if new_unit in conversion:
            return ingred
    # could not find it in the list, so use the first one
    return ingred_list[0]
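# Hedged example for get_best_unit() above: given the alternative measurements
# parsed from "1 pkg (7.5 oz)", pick the one whose unit appears in the
# conversion table. The data and expected output below are illustrative.
conversion_table = {'oz': 30, 'cup': 240}           # hypothetical ml-per-unit values
options = ['1 pkg ', '7.5 oz']
print(get_best_unit(options, conversion_table))     # expected '7.5 oz', since 'pkg' is unknown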
def inject_nn_error(example):
    tok_sentence, features_seq = example[0].split(), example[1]
    noun_indices = []
    for i in range(len(features_seq)):
        if features_seq[i]['universal_postag'] == 'NOUN':
            noun_indices.append(i)
    if len(noun_indices) > 0:
        j = random.choice(noun_indices)
        singular_form = pattern.singularize(tok_sentence[j])
        plural_form = pattern.pluralize(tok_sentence[j])
        if tok_sentence[j] != singular_form:
            tok_sentence[j] = singular_form
        elif tok_sentence[j] != plural_form:
            tok_sentence[j] = plural_form
        else:
            return None
    return [' '.join(tok_sentence), features_seq]
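# Sketch of how inject_nn_error() above might be called. It assumes `pattern`
# is a module exposing singularize/pluralize (e.g. pattern.text.en imported
# under that name, which is how the original module presumably imports it) and
# that each token carries a 'universal_postag' feature. Data is made up.
import random
from pattern.text import en as pattern

example = [
    'the dogs bark loudly',
    [{'universal_postag': 'DET'}, {'universal_postag': 'NOUN'},
     {'universal_postag': 'VERB'}, {'universal_postag': 'ADV'}],
]
corrupted = inject_nn_error(example)
print(corrupted)  # e.g. ['the dog bark loudly', ...] with the noun's number flipped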
def create_characters(dependencies):
    global NEXT_CHARACTER_ID
    characters = []
    for dependency in dependencies:
        cpostag = dependency['CPOSTAG']
        if not (cpostag.startswith('N') or cpostag.startswith('PR')):
            dependency['CHARACTER_ID'] = ''
            continue
        dependency['CHARACTER_ID'] = str(NEXT_CHARACTER_ID)
        NEXT_CHARACTER_ID += 1
        form = dependency['FORM']
        words = form.split(' ')
        gender = ''
        object_state = ''
        if dependency['CPOSTAG'].startswith('P'):
            if set(words) & PLURAL_PRONOUNS:
                num = 'pl'
            else:
                num = 'sg'
            if set(words) & MALE_PRONOUNS:
                gender = 'm'
            elif set(words) & FEMALE_PRONOUNS:
                gender = 'f'
            elif set(words) & NEUTRAL_PRONOUNS:
                gender = 'n'
            if not gender == 'n':
                object_state = 'a'
        else:
            if dependency['POSTAG'].endswith('S'):
                num = 'pl'
            else:
                num = 'sg'
            try:
                synset = wordnet.synsets(singularize(lemmatise(words[-1])))[0]
            except IndexError:
                try:
                    synset = wordnet.synsets(lemmatise(words[-1]))[0]
                except IndexError:
                    try:
                        synset = wordnet.synsets(singularize(words[-1]))[0]
                    except IndexError:
                        try:
                            synset = wordnet.synsets(words[-1])[0]
                        except IndexError:
                            logging.error("Failed to find synset for '" + words[-1] + "'")
                            continue
            hyps = set()
            for h in synset.hypernyms(recursive=True):
                try:
                    hyps.add(h.gloss)
                except ValueError:
                    continue
            hyps.add(synset.gloss)
            object_state = determine_object_state(hyps)
            gender = determine_gender(hyps, object_state)
        character = Character(dependency['ID'], num, gender, object_state)
        character.text = form
        if dependency['POSTAG'].startswith('P') and not set(words) & NON_RESOLUTION_PRONOUNS:
            character.is_pronoun = True
            character.add_relation("IsA", form)
        characters.append(character)
    return characters