def construct_What(question, pos, parent_pos, parent_ner, parent_dep_tree, thisNP):
    """Turn a sentence remainder into a "Who ..." / "What ..." question.

    The POS tag of the first entry of ``pos`` selects the template:

    * ``VBD``/``VBZ``/``NNS``/``VBP``/``NNP``: wh-word + decapitalized question.
    * ``VB``: the first token is run through ``wd(...).pluralize()`` and the
      remaining tokens are appended after it.
    * ``VBN``/``VBG``: ``"What is "`` + decapitalized question (plain
      ``"Who "`` when the answer is a person).

    "Who" replaces "What" when ``thisNP`` contains one of the pronouns
    he/she/they/we/i, when ``"PERSON"`` is among the NER tags returned by
    ``getExhaustiveNERs(thisNP)``, or when the NER of the parent subject is
    ``"PERSON"``.

    Args:
        question: Text that follows the wh-word.
        pos: Sequence of ``(token, tag)`` pairs; only ``pos[0][1]`` is read.
        parent_pos: Unused; kept for interface compatibility.
        parent_ner: Passed to ``findSubjNER`` together with the subject.
        parent_dep_tree: Passed to ``findSubj`` to locate the subject.
        thisNP: Noun phrase the question asks about.

    Returns:
        The space-joined question tokens ending in ``"?"``, or ``None`` when
        the leading tag matches no template (including an empty ``pos``) or
        the ``VB`` branch has no tokens to work with.
    """
    prps = ["he", "she", "they", "we", "i"]
    full_NP_ners, _ = getExhaustiveNERs(thisNP)
    lower_NP_tokens = thisNP.lower().split(" ")
    subj = findSubj(parent_dep_tree)
    subj_ner = findSubjNER(parent_ner, subj)

    # An empty tag list matches no template instead of raising IndexError.
    head_tag = pos[0][1] if pos else None

    if head_tag in ("VBD", "VBZ", "NNS", "VBP", "NNP"):
        what_prefix = "What "
        remainder = getDecapitalized(question)
    elif head_tag == "VB":
        tokens = word_tokenize(getDecapitalized(question))
        if not tokens:
            return None
        verb = wd(tokens[0]).pluralize()
        # Compare BEFORE the separating space is added; the previous code
        # appended " " first, so this correction could never trigger.
        # NOTE(review): the replacement literal is kept as written ("have");
        # confirm whether "has" was the intended form.
        if verb == "haves":
            verb = "have"
        what_prefix = "What "
        remainder = verb + " " + " ".join(tokens[1:])
    elif head_tag in ("VBN", "VBG"):
        what_prefix = "What is "
        remainder = getDecapitalized(question)
    else:
        return None

    asks_about_person = (
        any(x in lower_NP_tokens for x in prps)
        or "PERSON" in full_NP_ners
        or "PERSON" == subj_ner
    )
    sentence = ("Who " if asks_about_person else what_prefix) + remainder

    # Collapse runs of spaces, then swap a trailing "." / "," for "?".
    sentence = re.sub(' +', ' ', sentence).strip()
    que_tokens = word_tokenize(sentence)
    if que_tokens[-1] in (".", ","):
        que_tokens[-1] = "?"
    else:
        que_tokens.append("?")
    return " ".join(que_tokens)
if os.path.isdir(abs_path): yelp_category = file_or_dir pos_seeds_no_lemma = pos_neg_words_dict_per_category.get( yelp_category, {}).get("pos", None) neg_seeds_no_lemma = pos_neg_words_dict_per_category.get( yelp_category, {}).get("neg", None) if not (pos_seeds_no_lemma or neg_seeds_no_lemma): print( "Seeds not defined in socialconfig.py for {c} category, ignoring" .format(c=yelp_category)) continue pos_seeds = [] neg_seeds = [] for seed in pos_seeds_no_lemma: pos_seeds.append(wd(seed).lemmatize("a")) for seed in neg_seeds_no_lemma: neg_seeds.append(wd(seed).lemmatize("a")) for bdir, subdirs, files in os.walk(abs_path): if len(files) > 0: # any file discovered here should be an embeddings file for file in files: if file[0] != "." and "word2vec_GIGA_Embeddings_yelp" in file and file[ -4:] == ".txt": # we found an embedding file embedding_abs_file_path = os.path.join(bdir, file) print("Using embeddings in {f}".format( f=embedding_abs_file_path))
def process(self):
    """Split saved Yelp reviews into per-category files, adding adjectives.

    Every entry of ``config["SAVE_REVIEWS_DIRECTORY"]`` whose name contains
    ``"yelp_reviews_"`` is read through ``get_iterable``. Each item is parsed
    as a JSON review, gets an ``"adjectives"`` field (space-joined lemmas of
    the words tagged JJ/JJR/JJS, after ``self.strip_special_chars``), and is
    written as one JSON line to the file of every category that
    ``self.business_to_cat`` lists for the review's ``business_id``.
    Categories outside the named set go to the ``other`` bucket and are
    reported on stdout.

    Output files live in
    ``config["SAVE_REVIEWS_BY_CATEGORY_DIRECTORY"]/<category>/`` and are named
    ``yelp_reviews_<category>_<n>.txt``; a new file is started after every
    25000 lines written to a category. All category directories and their
    first file are created up front, even if nothing is written to them.

    Reading stops once ``config["PROCESS_N_REVIEWS_ONLY"]`` reviews (default
    1000000) have been read, across all input files. If either directory key
    is missing from ``config``, a message is printed and the interpreter
    exits with status 0.
    """
    # The last entry doubles as the fall-through bucket for unknown categories.
    named_categories = ("bars", "food", "grooming", "learn", "leisure",
                        "municipal", "planning", "services", "shopping",
                        "sports", "health")
    fallback_category = "other"
    all_categories = named_categories + (fallback_category,)
    reviews_per_file = 25000
    adjective_tags = ("JJ", "JJR", "JJS")

    # directory where reviews are kept
    reviews_dir = config.get("SAVE_REVIEWS_DIRECTORY", None)
    by_category_dir = config.get("SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None)
    max_reviews = int(config.get("PROCESS_N_REVIEWS_ONLY", 1000000))
    print(
        "Will process only {num} reviews as per the directive in the socialconfig.py"
        .format(num=str(max_reviews)))
    if not (reviews_dir and by_category_dir):
        print(
            "config keys are not set correctly in the config file: socialconfig.py"
        )
        exit(0)

    if not os.path.isdir(by_category_dir):
        os.makedirs(by_category_dir)

    category_dirs = {}
    current_paths = {}
    for name in all_categories:
        category_dir = os.path.join(by_category_dir, name)
        if not os.path.isdir(category_dir):
            os.makedirs(category_dir)
        category_dirs[name] = category_dir
        current_paths[name] = os.path.join(
            category_dir, "yelp_reviews_{name}_1.txt".format(name=name))

    written = dict.fromkeys(all_categories, 0)
    handles = {}
    review_counter = 0
    limit_reached = False
    try:
        for name in all_categories:
            handles[name] = open(current_paths[name], 'w')

        for entry in os.listdir(reviews_dir):
            abs_file_path = os.path.join(reviews_dir, entry)
            # Test the full path: the bare entry name would be resolved
            # against the current working directory.
            if os.path.isdir(abs_file_path) or "yelp_reviews_" not in entry:
                continue

            for review in get_iterable(abs_file_path):
                review_counter += 1
                review_dict = ujson.loads(review)
                business_id = review_dict.get("business_id", None)
                review_text = review_dict.get("review_text", None)
                tags = tb(review_text).tags
                # Length is checked before and after stripping special
                # characters, so words that shrink to <= 2 chars are dropped.
                stripped_adjectives = [
                    self.strip_special_chars(word.lower())
                    for word, tag in tags
                    if tag in adjective_tags and len(word) > 2
                ]
                adj_words_list = [
                    wd(fword).lemmatize("a")
                    for fword in stripped_adjectives if len(fword) > 2
                ]
                review_dict.update({"adjectives": " ".join(adj_words_list)})
                write_line = ujson.dumps(review_dict)

                for cat in self.business_to_cat.get(business_id, []):
                    excluded = cat not in named_categories
                    bucket = fallback_category if excluded else cat

                    handles[bucket].write(write_line + "\n")
                    written[bucket] += 1
                    if written[bucket] % reviews_per_file == 0:
                        # Close explicitly rather than relying on `del` + GC.
                        handles[bucket].close()
                        old_path = current_paths[bucket]
                        # Always roll over inside the bucket's OWN directory
                        # (leisure files used to be created under learn/).
                        new_path = os.path.join(
                            category_dirs[bucket],
                            "yelp_reviews_{name}_{c}.txt".format(
                                name=bucket,
                                c=written[bucket] // reviews_per_file + 1))
                        # The bars message has always used an upper-case label.
                        label = "BARS" if bucket == "bars" else bucket
                        print(
                            "{n} Reviews collected in `{label}` category, creating a new file to keep file sizes manageable"
                            .format(n=reviews_per_file, label=label))
                        print('Changing file from : {old} to {new}'.format(
                            old=old_path, new=new_path))
                        handles[bucket] = open(new_path, 'w')
                        current_paths[bucket] = new_path
                    if excluded:
                        # NOTE(review): placement inferred — the collapsed
                        # source does not show whether this was printed on
                        # every fall-through write or only on rollover.
                        print("Excluded Category encountered: {cat}".format(
                            cat=cat))

                if review_counter % 100 == 0:
                    print("{num}00 reviews processed".format(
                        num=str(review_counter // 100)))
                if review_counter >= max_reviews:
                    limit_reached = True
                    break

            # Stop scanning further input files once the limit is hit.
            if limit_reached:
                break
    finally:
        for handle in handles.values():
            handle.close()

    print("{count} Reviews processed".format(count=review_counter))
save_embeddings_file_path = os.path.join( save_in_dir, entity.replace( "word2vec_keyed_vectors_yelp_", "word2vec_GIGA_Embeddings_yelp_")) try: print("Creating Embeddings..") with open(save_embeddings_file_path, 'w') as efile: # i = 0 print("Created file:{file}".format( file=save_embeddings_file_path)) for word in word_vectors.vocab: lemword = wd(word).lemmatize("a") try: if lemword not in filtered_words: continue rp = word_vectors.get_vector( lemword) write_string = lemword + ' ' + ' '.join( map(str, rp)) efile.write(write_string + "\n") except: print( "{lemword} is not in word2vec vocab" .format(lemword=lemword)) pass print("Embeddings CREATED...")
os.path.join(abs_path, entry)) for rev in reviews: # just to check if the line is not empty or junk if len(rev) > 3: try: rev_string = ujson.loads(rev) except ValueError: print( "Parsing Error,string not valid json: " + rev) continue except: print("Parsing Error, string: " + rev) continue adjectives_list = [ wd(w).lemmatize("a") for w in [ reviews.strip_special_chars(word) for word in rev_string.split(" ") ] if len(w) > 2 ] if adjectives_list: review_docs.append(adjectives_list) rev_count += 1 if rev_count > PROCESS_N_REVIEWS_ONLY_PER_CATEGORY: break else: # We don't break here as os.listdir does not guarantee to walk the files in order continue print("Read {count} documents for {c} category".format( count=str(rev_count), c=yelp_category)) # Now we have all the review docs for the specific category