def make_corpus(target_folder):
    """Concatenate all cleaned post texts under target_folder into a single corpus file."""
    print(target_folder)
    corpus_name = target_folder + '.txt'
    f_wr = open(os.path.join(CONFIG.DATA_PATH, 'corpus', corpus_name), 'w', encoding='utf-8')
    text_path = os.path.join(CONFIG.DATA_PATH, target_folder)
    text_folder_list = os.listdir(text_path)
    count = 0
    # languages_dic = dict()
    for text_folder in text_folder_list:
        text_files = os.listdir(os.path.join(text_path, text_folder))
        for text_file in text_files:
            if text_file.endswith('.txt') and not text_file.endswith('_location.txt'):
                if count % 100 == 0:
                    print(count)
                with open(os.path.join(text_path, text_folder, text_file), 'r',
                          encoding='utf-8', newline='\n') as f:
                    data = f.read()
                    line = process_text(data)
                    if len(line) > 0:
                        f_wr.write(line + ' <EOS> <PAD>\n')
                count = count + 1
    f_wr.close()
    # csv_name = target_folder + '_meta.csv'
    # with open(os.path.join(CONFIG.CSV_PATH, csv_name), 'w', encoding='utf-8-sig', newline='') as f:
    #     w = csv.writer(f)
    #     for k, v in languages_dic.items():
    #         w.writerow((k, v))
    print("Completed making corpus")
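# Hedged usage sketch (added for illustration; 'instagram0830' is a placeholder folder
# name under CONFIG.DATA_PATH, not necessarily one that exists in this project). It builds
# the corpus and prints the first few lines that were written.
def _example_make_corpus():
    make_corpus('instagram0830')
    corpus_file = os.path.join(CONFIG.DATA_PATH, 'corpus', 'instagram0830.txt')
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for _ in range(3):
            print(f.readline().rstrip())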
def crawler(key, info_tuple, max=5):
    '''
    Breadth-first crawl over related hashtags starting from `key`.
    max: maximum trials allowed (suggested: 5)
    '''
    queue = [key]
    path = []
    i = 1
    dic = {}
    while len(queue) > 0 and i <= max:
        key = queue.pop(0)
        path.append(key)
        web = util.retrieve_web(key, info_tuple)
        hashtag = util.process_hashtag(web)
        text = util.process_text(web)
        # enqueue newly seen hashtags that are neither queued nor already visited
        queue.extend([
            x for x in set(hashtag)
            if x != key and x not in queue and x not in path
        ])
        dic.update({key: [hashtag, text]})
        i += 1
    return dic
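# Hedged usage sketch (illustration only): 'food' is a placeholder seed hashtag, and the
# contents of info_tuple depend on util.retrieve_web, which is not shown here.
def _example_crawler(info_tuple):
    # Crawl at most 5 hashtag pages starting from the seed, then list what was found.
    results = crawler('food', info_tuple, max=5)
    for tag, (hashtags, text) in results.items():
        print(tag, hashtags)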
def get_expenses_for_rows(df, stor_exp_data_path, stor_data_path, budg_path, bankconfig):
    """
    Gets the expense data for stores, prompting the user when multiple expenses exist for a store
    params:
        df - pandas dataframe of transactions
        stor_exp_data_path - filepath to expensesDB
        stor_data_path - filepath to the stores database
        budg_path - filepath to the budget database
        bankconfig - bank-specific settings, including the regex used to clean store names
    """
    print("\nIterating your transactions. If you want to quit halfway, type ctrl c to save!\n")
    # initialize the objects for tracking changes
    exp_stor_db = data_help.read_jsonFile(stor_exp_data_path)
    stor_db = data_help.read_jsonFile(stor_data_path)
    budg_db = data_help.read_jsonFile(budg_path)

    try:
        for idx, row in df.iterrows():
            # iterate through only the data which has no expenses declared.
            if pd.isnull(row[env.EXPENSE]):
                # get relevant expenses for that month set by the user.
                month_end_date = util.get_month_from_timestamp(row[env.DATE], start=False)
                if type(row[env.BANK_STORENAME]) is str:
                    match = bankconfig.regex_str.search(row[env.BANK_STORENAME])
                    if match:
                        processed_text = util.process_text(match.group(0))
                        print(f"Was able to filter - {row[env.BANK_STORENAME]} -> {processed_text}")
                        storename = processed_text
                    else:
                        print(f"Unable to filter - {row[env.BANK_STORENAME]}")
                        storename = row[env.BANK_STORENAME]
                else:  # default case: use an empty storename
                    print("No storename exists for this transaction.")
                    storename = ""

                print("Curr Transaction: %-10s | %-10s | %-10s | %-10s " %
                      (row[env.DATE], row[env.AMOUNT], storename, row[env.TYPE]))
                selected_exp, exp_stor_db, stor_db, storename = search_store_relationships(
                    storename, exp_stor_db, budg_db[month_end_date], stor_exp_data_path,
                    stor_db, stor_data_path)
                df.at[idx, env.FILT_STORENAME] = storename
                df.at[idx, env.EXPENSE] = selected_exp

    except KeyboardInterrupt:
        print("\n\nQuitting to main menu. Your data inputs will be saved, and you can resume where you left off by restarting and selecting 'v' for view data!\n")

    return df
def process_dataset_text(target_dataset):
    dataset_path = os.path.join(CONFIG.DATASET_PATH, target_dataset)
    if not os.path.exists(dataset_path):
        os.mkdir(dataset_path)
    df_data = pd.read_csv(os.path.join(CONFIG.TARGET_PATH, 'posts.csv'), encoding='utf-8-sig')

    print("tokenizing sentences...")
    pbar = tqdm(total=df_data.shape[0])
    shortcode_list = []
    word_list_list = []
    for index, in_row in df_data.iterrows():
        pbar.update(1)
        if pd.isna(in_row.iloc[2]):
            continue
        word_list = process_text(in_row.iloc[2])
        if len(word_list) > 0:
            shortcode_list.append(in_row.iloc[1])
            word_list_list.append(word_list)
    pbar.close()

    print("counting word frequencies...")
    frequency = {}
    pbar = tqdm(total=len(word_list_list))
    for word_list in word_list_list:
        pbar.update(1)
        for word in word_list:
            count = frequency.get(word, 0)
            frequency[word] = count + 1
    pbar.close()

    print("converting rare words to the UNK token...")
    pbar = tqdm(total=len(word_list_list))
    processed_word_list_list = []
    for word_list in word_list_list:
        pbar.update(1)
        processed_word_list = []
        for word in word_list:
            if frequency[word] < CONFIG.MIN_WORD_COUNT:
                processed_word_list.append('UNK')
            else:
                processed_word_list.append(word)
        processed_word_list_list.append(processed_word_list)
    pbar.close()

    print("making corpus and csv files...")
    f_csv = open(os.path.join(dataset_path, 'posts.csv'), 'w', encoding='utf-8-sig')
    f_corpus = open(os.path.join(dataset_path, 'corpus.txt'), 'w', encoding='utf-8')
    wr = csv.writer(f_csv)
    pbar = tqdm(total=len(processed_word_list_list))
    for index in range(len(processed_word_list_list)):
        pbar.update(1)
        sentence = ' '.join(processed_word_list_list[index])
        if len(sentence) > 0:
            out_row = []
            out_row.append(shortcode_list[index])
            out_row.append(sentence + ' <EOS>')
            wr.writerow(out_row)
            f_corpus.write(sentence + ' <EOS>\n')
    pbar.close()
    f_csv.close()
    f_corpus.close()
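# Equivalent-form sketch (added; not used by the pipeline above): collections.Counter
# produces the same word-frequency table as the manual counting loop in
# process_dataset_text, and could replace it without changing behavior.
from collections import Counter

def count_word_frequencies(word_list_list):
    # Counter.update(list) increments the count of every element in the list.
    frequency = Counter()
    for word_list in word_list_list:
        frequency.update(word_list)
    return frequency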
te_d = c.fetchall()
c.execute("SELECT * FROM validation")
va_d = c.fetchall()

tr_x = [d[0] for d in tr_d]
tr_y_ = [d[2] for d in tr_d]
te_x = [d[0] for d in te_d]
te_y_ = [d[2] for d in te_d]
va_x = [d[0] for d in va_d]
va_y_ = [d[2] for d in va_d]
print("Done.\n")

if CLEAN_TEXT:
    print("Cleaning data\n----------")
    ptr_x = [process_text(x, stop_words) for x in tr_x]
    pte_x = [process_text(x, stop_words) for x in te_x]
    pva_x = [process_text(x, stop_words) for x in va_x]
    print("Done.\n")
else:
    ptr_x = tr_x
    pte_x = te_x
    pva_x = va_x

# Transform text to document-term matrix
print("Creating document-term matrix\n---------")
cv = CountVectorizer(min_df=2)
cv = cv.fit(ptr_x)
tr_dtmat = cv.transform(ptr_x)
if TF_IDF:
    tfidf = TfidfTransformer()
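# Hedged illustration (added; not part of the original pipeline). The excerpt above ends
# right after constructing TfidfTransformer. The helper below shows, on toy data, the usual
# fit/transform pattern for CountVectorizer + TfidfTransformer; it is wrapped in a function
# so it does not run as part of this script, and its names (docs, dtmat, tfidf_mat) are
# placeholders rather than names from this project.
def _tfidf_pattern_demo():
    docs = ["the cat sat", "the dog sat", "the cat ran"]
    cv_demo = CountVectorizer(min_df=1).fit(docs)         # learn vocabulary from the corpus
    dtmat = cv_demo.transform(docs)                       # sparse document-term count matrix
    tfidf_mat = TfidfTransformer().fit_transform(dtmat)   # reweight counts by TF-IDF
    print(tfidf_mat.shape)                                # (3, vocabulary size)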
def copy_selected_post(target_folder):
    from util import process_text
    path_to_posts = {}
    data_path = os.path.join(CONFIG.DATA_PATH, target_folder)
    for directory in os.listdir(data_path):
        path_dir = os.path.join(data_path, directory)
        path_to_posts[directory] = []
        for file in os.listdir(path_dir):
            if file.endswith('UTC.txt'):
                path_to_posts[directory].append(file)
    print("Total # of locations: ", len(path_to_posts))

    dataset_path = os.path.join(CONFIG.DATASET_PATH, target_folder)
    if not os.path.exists(dataset_path):
        os.mkdir(dataset_path)

    count = 0
    for directory, posts in path_to_posts.items():
        print(str(count), "th Location directory: ", directory)
        path_dir = os.path.join(data_path, directory)
        # drop location files and anything that is not a jpg/txt/json post file
        for file in os.listdir(path_dir):
            if file.endswith('location.txt'):
                os.remove(os.path.join(path_dir, file))
                continue
            if not file.endswith('.jpg') and not file.endswith('.txt') and not file.endswith('.json'):
                os.remove(os.path.join(path_dir, file))
                continue
        for post in tqdm(posts):
            post_name = post.replace(".txt", "")
            post_dic = {"img": [], "text": "", "json": ""}
            # collect the image, text, and metadata files belonging to this post
            for file in os.listdir(path_dir):
                if file.startswith(post_name):
                    if file.endswith('.jpg'):
                        post_dic['img'].append(file)
                    elif file.endswith('.json'):
                        post_dic['json'] = file
                    elif file.endswith('.txt') and not file.endswith('location.txt'):
                        post_dic['text'] = file
            # only keep posts that have at least one image, a text file, and metadata
            if len(post_dic["img"]) > 0 and post_dic["text"] != "" and post_dic["json"] != "":
                with open(os.path.join(path_dir, post_dic["text"]), 'r',
                          encoding='utf-8', newline='\n') as f:
                    data = f.read()
                    line = process_text(data)
                    if len(line) > 0:
                        path_to_location = os.path.join(dataset_path, directory)
                        if not os.path.exists(path_to_location):
                            os.mkdir(path_to_location)
                        path_to_post = os.path.join(dataset_path, directory, post_name)
                        if not os.path.exists(path_to_post):
                            os.mkdir(path_to_post)
                        shutil.move(os.path.join(path_dir, post_dic["json"]),
                                    os.path.join(path_to_post, "meta.json"))
                        os.mkdir(os.path.join(path_to_post, "images"))
                        for idx, img in enumerate(post_dic["img"]):
                            img_name = "image_" + str(idx) + ".jpg"
                            shutil.move(os.path.join(path_dir, img),
                                        os.path.join(path_to_post, "images", img_name))
                        f_wr = open(os.path.join(path_to_post, "text.txt"), 'w', encoding='utf-8')
                        f_wr.write(line + ' <EOS>\n')
                        f_wr.close()
        shutil.rmtree(path_dir)
        count = count + 1
    print("Copy completed")
def test(target_dataset):
    # Exploratory image-feature statistics code, kept commented out for reference.
    # toy_path = os.path.join(CONFIG.DATASET_PATH, 'instagram0830')
    # full_data = []
    # full_data_norm = []
    # for image_path in os.listdir(os.path.join(toy_path, 'resnext101_32x8d')):
    #     with open(os.path.join(toy_path, 'resnext101_32x8d', image_path), "rb") as f:
    #         image_data = cPickle.load(f)
    #         full_data.append(image_data)
    #         image_data_norm = np.linalg.norm(image_data, axis=1, ord=2)
    #         full_data_norm.append(image_data_norm)
    # full_data = np.array(full_data, dtype=np.float32)
    # full_data_norm = np.array(full_data_norm, dtype=np.float32)
    # temp = np.mean(np.mean(full_data, axis=2), axis=1)
    # print(temp.shape)
    # print("mean: ", np.mean(np.mean(full_data, axis=2), axis=1))
    # print("std: ", np.mean(np.std(full_data, axis=2), axis=1))
    # print("max: ", np.mean(np.max(full_data, axis=2), axis=1))
    # print("min: ", np.mean(np.min(full_data, axis=2), axis=1))
    # print("norm: ", full_data_norm)

    dataset_path = os.path.join(CONFIG.DATASET_PATH, target_dataset)
    if not os.path.exists(dataset_path):
        os.mkdir(dataset_path)
    with open(os.path.join('./data', 'pickle', 'hotel_reviews.p'), 'rb') as f:
        dataset = cPickle.load(f, encoding="latin1")

    print("tokenizing sentences...")
    shortcode_list = []
    word_list_list = []
    pbar = tqdm(total=len(dataset[0]))
    for pg in dataset[0]:
        pbar.update(1)
        data = " ".join([dataset[3][idx] for idx in pg])
        data = data.replace("END_TOKEN", "")
        word_list = process_text(data)
        if len(word_list) > 0:
            word_list_list.append(word_list)
    pbar.close()

    pbar = tqdm(total=len(dataset[1]))
    for pg in dataset[1]:
        pbar.update(1)
        data = " ".join([dataset[3][idx] for idx in pg])
        data = data.replace("END_TOKEN", "")
        word_list = process_text(data)
        if len(word_list) > 0:
            word_list_list.append(word_list)
    pbar.close()

    print("making corpus and csv files...")
    f_csv = open(os.path.join(dataset_path, 'posts.csv'), 'w', encoding='utf-8')
    f_corpus = open(os.path.join(dataset_path, 'corpus.txt'), 'w', encoding='utf-8')
    wr = csv.writer(f_csv)
    pbar = tqdm(total=len(word_list_list))
    for word_list in word_list_list:
        pbar.update(1)
        sentence = ' '.join(word_list) + ' <EOS>'
        out_row = []
        out_row.append('asd')  # placeholder shortcode column
        out_row.append(sentence)
        wr.writerow(out_row)
        f_corpus.write(sentence + '\n')
    pbar.close()
    f_csv.close()
    f_corpus.close()
    for start, stop in pairwise(path):
        track = graph.edge[start][stop]['track']
        playlist.append((track['name'], track['artists'][0]['name'], track['href']))
    return playlist


# TODO: turn this thing into a webapp :)
def _print_playlist(playlist):
    for track in playlist:
        print("{!s:<20} by {!s:<30} {!s:<30}".format(*track))


# TODO: improve text conditioning
def spoetify(text):
    words = text.split()
    graph = _build_graph(words)
    playlist = _build_playlist(graph, words)
    if not playlist:
        raise SystemExit
    _print_playlist(playlist)


if __name__ == '__main__':
    import fileinput

    # Read poem text from the files named on the command line, or from stdin if none are given.
    input_parts = []
    for line in fileinput.input():
        input_parts.append(process_text(line))
    input_string = u' '.join(input_parts)
    spoetify(input_string)