Example 1
def make_corpus(target_folder):
    """Write every processed post text under target_folder into a single corpus file, one line per post."""
    print(target_folder)
    corpus_name = target_folder + '.txt'
    f_wr = open(os.path.join(CONFIG.DATA_PATH, 'corpus', corpus_name),
                'w',
                encoding='utf-8')
    text_path = os.path.join(CONFIG.DATA_PATH, target_folder)
    text_folder_list = os.listdir(text_path)
    count = 0
    # languages_dic = dict()
    for text_folder in text_folder_list:
        # print("folder: ", text_folder)
        text_files = os.listdir(os.path.join(text_path, text_folder))
        for text_file in text_files:
            if text_file.endswith(
                    '.txt') and not text_file.endswith('_location.txt'):
                if count % 100 == 0:
                    print(count)
                with open(os.path.join(text_path, text_folder, text_file),
                          'r',
                          encoding='utf-8',
                          newline='\n') as f:
                    # print("file: ", text_file)
                    data = f.read()
                    line = process_text(data)
                    if len(line) > 0:
                        f_wr.write(line + ' <EOS> <PAD>\n')
                count = count + 1
    f_wr.close()
    # csv_name = target_folder + '_meta.csv'
    # with open(os.path.join(CONFIG.CSV_PATH, csv_name), 'w', encoding='utf-8-sig', newline='') as f:
    # 	w = csv.writer(f)
    # 	for k,v in languages_dic.items():
    # 		w.writerow((k, v))
    print("Finished making corpus")
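For reference, the directory layout this function expects, inferred from the path handling above (a sketch, not taken from the original source):

# CONFIG.DATA_PATH/
#     corpus/<target_folder>.txt      <- output: one processed post per line, ending in " <EOS> <PAD>"
#     <target_folder>/
#         <location_folder>/
#             <post>.txt              <- read and passed through process_text()
#             <post>_location.txt     <- skipped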
Example 2
def crawler(key, info_tuple, max=5):
    '''
    max: maximum number of crawl iterations allowed (suggested: 5)
    '''
    queue = [key]
    path = []
    i = 1
    dic = {}

    while len(queue) > 0 and i <= max:
        key = queue.pop(0)
        path.append(key)
        web = util.retrieve_web(key, info_tuple)
        hashtag = util.process_hashtag(web)
        text = util.process_text(web)
        queue.extend([
            x for x in set(hashtag)
            if x != key and x not in queue and x not in path
        ])
        dic[key] = [hashtag, text]

        i += 1

    return dic
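A minimal usage sketch of the breadth-first crawl above, with a hypothetical stub standing in for the project's util module (retrieve_web, process_hashtag, process_text); the module name crawler_mod is also an assumption:

import sys
from types import SimpleNamespace

# Hypothetical in-memory "web": key -> (hashtags, text).
FAKE_WEB = {
    'python': (['coding', 'data'], 'posts about python'),
    'coding': (['python'], 'posts about coding'),
    'data': (['ml'], 'posts about data'),
    'ml': ([], 'posts about ml'),
}

# Inject the stub before the crawler's module is imported, so its util.* calls
# resolve to the fakes below.
sys.modules['util'] = SimpleNamespace(
    retrieve_web=lambda key, info: FAKE_WEB.get(key, ([], '')),
    process_hashtag=lambda web: web[0],
    process_text=lambda web: web[1],
)

from crawler_mod import crawler  # hypothetical module name

result = crawler('python', info_tuple=None, max=3)
print(list(result))  # at most 3 keys visited, e.g. ['python', 'coding', 'data']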
Example 3
def get_expenses_for_rows(df, stor_exp_data_path, stor_data_path, budg_path,
                          bankconfig):
    """
    Gets the expense data for stores, prompting the user when multiple expenses exist for a store.
    params:
        df - pandas dataframe of transactions
        stor_exp_data_path - filepath to expensesDB
        stor_data_path - filepath to the stores DB
        budg_path - filepath to the budgets DB
        bankconfig - bank-specific config (provides the storename regex)
    """
    print(
        "\nIterating your transactions. If you want to quit halfway, press Ctrl+C to save!\n"
    )

    # initialize the objects for tracking changes
    exp_stor_db = data_help.read_jsonFile(stor_exp_data_path)
    stor_db = data_help.read_jsonFile(stor_data_path)
    budg_db = data_help.read_jsonFile(budg_path)
    try:
        for idx, row in df.iterrows():
            # iterate through only the data which has no expenses declared.
            if pd.isnull(row[env.EXPENSE]):
                # get relevant expenses for that month set by the user.
                month_end_date = util.get_month_from_timestamp(row[env.DATE],
                                                               start=False)
                if isinstance(row[env.BANK_STORENAME], str):
                    match = bankconfig.regex_str.search(
                        row[env.BANK_STORENAME])

                    if match:

                        processed_text = util.process_text(match.group(0))
                        print(
                            f"Was able to filter - {row[env.BANK_STORENAME]} -> {processed_text}"
                        )
                        storename = processed_text

                    else:
                        print(f"Unable to filter - {row[env.BANK_STORENAME]}")
                        storename = row[env.BANK_STORENAME]

                else:  # default case use empty str
                    print("No storename exists for this transaction.")
                    storename = ""

                print(
                    "Curr Transaction:  %-10s | %-10s | %-10s | %-10s " %
                    (row[env.DATE], row[env.AMOUNT], storename, row[env.TYPE]))
                selected_exp, exp_stor_db, stor_db, storename = search_store_relationships(
                    storename, exp_stor_db, budg_db[month_end_date],
                    stor_exp_data_path, stor_db, stor_data_path)
                df.at[idx, env.FILT_STORENAME] = storename
                df.at[idx, env.EXPENSE] = selected_exp

    except KeyboardInterrupt:
        print(
            "\n\nQuitting to main menu. Your data inputs will be saved, and you can resume where you left off by restarting and selecting 'v' for view data!\n"
        )

    return df
Example 4
def process_dataset_text(target_dataset):
	dataset_path = os.path.join(CONFIG.DATASET_PATH, target_dataset)
	if not os.path.exists(dataset_path):
		os.mkdir(dataset_path)
	df_data = pd.read_csv(os.path.join(CONFIG.TARGET_PATH, 'posts.csv'), encoding='utf-8-sig')
	print("tokenizing sentences...")
	pbar = tqdm(total=df_data.shape[0])
	shortcode_list = []
	word_list_list = []
	for index, in_row in df_data.iterrows():
		pbar.update(1)
		if pd.isna(in_row.iloc[2]):
			continue
		word_list = process_text(in_row.iloc[2])
		if len(word_list) > 0:
			shortcode_list.append(in_row.iloc[1])
			word_list_list.append(word_list)
	pbar.close()
	print("counting frequencies...")
	frequency = {}
	pbar = tqdm(total=len(word_list_list))
	for word_list in word_list_list:
		pbar.update(1)
		for word in word_list:
			count = frequency.get(word, 0)
			frequency[word] = count + 1
	pbar.close()
	print("converting infrequent words to the UNK token...")
	pbar = tqdm(total=len(word_list_list))
	processed_word_list_list = []
	for word_list in word_list_list:
		pbar.update(1)
		processed_word_list = []
		for word in word_list:
			if frequency[word] < CONFIG.MIN_WORD_COUNT:
				processed_word_list.append('UNK')
			else:
				processed_word_list.append(word)
		processed_word_list_list.append(processed_word_list)
	pbar.close()
	print("making corpus and csv files...")
	f_csv = open(os.path.join(dataset_path, 'posts.csv'), 'w', encoding='utf-8-sig', newline='')  # newline='' keeps csv.writer from emitting blank rows on Windows
	f_corpus = open(os.path.join(dataset_path, 'corpus.txt'), 'w', encoding='utf-8')
	wr = csv.writer(f_csv)
	pbar = tqdm(total=len(processed_word_list_list))
	for index in range(len(processed_word_list_list)):
		pbar.update(1)
		sentence = ' '.join(processed_word_list_list[index])
		if len(sentence) > 0:
			out_row = []
			out_row.append(shortcode_list[index])
			out_row.append(sentence + ' <EOS>')
			wr.writerow(out_row)
			f_corpus.write(sentence + ' <EOS>\n')
	pbar.close()
	f_csv.close()
	f_corpus.close()
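The frequency-count and UNK-replacement passes above can be written more compactly with collections.Counter; a minimal stand-alone sketch of the same idea (MIN_WORD_COUNT here is a stand-in for CONFIG.MIN_WORD_COUNT):

from collections import Counter

MIN_WORD_COUNT = 2  # stand-in for CONFIG.MIN_WORD_COUNT

word_list_list = [['good', 'hotel'], ['good', 'view'], ['good', 'hotel', 'pool']]
frequency = Counter(word for word_list in word_list_list for word in word_list)

processed = [
    [word if frequency[word] >= MIN_WORD_COUNT else 'UNK' for word in word_list]
    for word_list in word_list_list
]
print(processed)  # [['good', 'hotel'], ['good', 'UNK'], ['good', 'hotel', 'UNK']]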
Example 5
# tr_d (the training rows) is assumed to have been fetched just above this excerpt.
te_d = c.fetchall()
c.execute("SELECT * FROM validation")
va_d = c.fetchall()

tr_x = [d[0] for d in tr_d]
tr_y_ = [d[2] for d in tr_d]
te_x = [d[0] for d in te_d]
te_y_ = [d[2] for d in te_d]
va_x = [d[0] for d in va_d]
va_y_ = [d[2] for d in va_d]

print("Done.\n")

if CLEAN_TEXT:
    print("Cleaning data\n----------")
    ptr_x = [process_text(x, stop_words) for x in tr_x]
    pte_x = [process_text(x, stop_words) for x in te_x]
    pva_x = [process_text(x, stop_words) for x in va_x]
    print("Done.\n")
else:
    ptr_x = tr_x
    pte_x = te_x
    pva_x = va_x

# Transform text to document-term matrix
print("Creating document-term matrix\n---------")
cv = CountVectorizer(min_df=2)
cv = cv.fit(ptr_x)
tr_dtmat = cv.transform(ptr_x)
if TF_IDF:
    tfidf = TfidfTransformer()
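The excerpt ends just after the TfidfTransformer is constructed. A minimal sketch of how the remaining transforms typically continue, reusing the names above (te_dtmat and va_dtmat are hypothetical; this is an assumption, not the original code):

if TF_IDF:
    tfidf = TfidfTransformer()
    # Fit IDF weights on the training split only, then reweight every split.
    tr_dtmat = tfidf.fit_transform(tr_dtmat)
    te_dtmat = tfidf.transform(cv.transform(pte_x))
    va_dtmat = tfidf.transform(cv.transform(pva_x))
else:
    te_dtmat = cv.transform(pte_x)
    va_dtmat = cv.transform(pva_x)
print("Done.\n")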
Example 6
def copy_selected_post(target_folder):
    from util import process_text
    # Keep only posts that have at least one image, a caption and a JSON meta
    # file; move them into CONFIG.DATASET_PATH/<target_folder> with normalized
    # names (meta.json, images/image_N.jpg, text.txt) and drop everything else.
    path_to_posts = {}
    data_path = os.path.join(CONFIG.DATA_PATH, target_folder)

    for directory in os.listdir(data_path):
        path_dir = os.path.join(data_path, directory)
        path_to_posts[directory] = []
        for file in os.listdir(path_dir):
            if file.endswith('UTC.txt'):
                path_to_posts[directory].append(file)

    print("Total # of locations: ", len(path_to_posts))

    data_path = os.path.join(CONFIG.DATA_PATH, target_folder)
    dataset_path = os.path.join(CONFIG.DATASET_PATH, target_folder)
    if not os.path.exists(dataset_path):
        os.mkdir(dataset_path)
    count = 0
    for directory, posts in path_to_posts.items():
        print(f"Location directory {count}: {directory}")
        path_dir = os.path.join(data_path, directory)

        for file in os.listdir(path_dir):
            if file.endswith('location.txt'):
                os.remove(os.path.join(path_dir, file))
                continue
            if not file.endswith('.jpg') and not file.endswith(
                    '.txt') and not file.endswith('.json'):
                os.remove(os.path.join(path_dir, file))
                continue

        for post in tqdm(posts):
            post_name = post.replace(".txt", "")
            post_dic = {"img": [], "text": "", "json": ""}
            for file in os.listdir(path_dir):
                if file.startswith(post_name):
                    if file.endswith('.jpg'):
                        post_dic['img'].append(file)
                    elif file.endswith('.json'):
                        post_dic['json'] = file
                    elif file.endswith(
                            '.txt') and not file.endswith('location.txt'):
                        post_dic['text'] = file
                    else:
                        pass

            if len(post_dic["img"]
                   ) > 0 and post_dic["text"] != "" and post_dic["json"] != "":

                with open(os.path.join(path_dir, post_dic["text"]),
                          'r',
                          encoding='utf-8',
                          newline='\n') as f:
                    # print("file: ", text_file)
                    data = f.read()
                    line = process_text(data)
                    if len(line) > 0:
                        path_to_location = os.path.join(
                            dataset_path, directory)
                        if not os.path.exists(path_to_location):
                            os.mkdir(path_to_location)
                        path_to_post = os.path.join(dataset_path, directory,
                                                    post_name)
                        if not os.path.exists(path_to_post):
                            os.mkdir(path_to_post)
                        shutil.move(os.path.join(path_dir, post_dic["json"]),
                                    os.path.join(path_to_post, "meta.json"))
                        os.mkdir(os.path.join(path_to_post, "images"))
                        for idx, img in enumerate(post_dic["img"]):
                            img_name = "image_" + str(idx) + ".jpg"
                            shutil.move(
                                os.path.join(path_dir, img),
                                os.path.join(path_to_post, "images", img_name))
                        f_wr = open(os.path.join(path_to_post, "text.txt"),
                                    'w',
                                    encoding='utf-8')
                        f_wr.write(line + ' <EOS>\n')
                        f_wr.close()
        shutil.rmtree(path_dir)
        count = count + 1

    print("Copy completed")
Example 7
def test(target_dataset):
	# toy_path = os.path.join(CONFIG.DATASET_PATH, 'instagram0830')
	# full_data = []
	# full_data_norm = []
	# for image_path in os.listdir(os.path.join(toy_path, 'resnext101_32x8d')):
	# 	with open(os.path.join(toy_path, 'resnext101_32x8d', image_path), "rb") as f:
	# 		image_data = cPickle.load(f)
	# 		# print(data)
	# 		# print(np.max(data))
	# 		# print(np.min(data))
	# 		# print(np.mean(data))
	# 		# print(data.shape)
	# 	full_data.append(image_data)
	# 	image_data_norm = np.linalg.norm(image_data, axis=1, ord=2)
	# 	full_data_norm.append(image_data_norm)
	# #df_data = pd.read_csv(os.path.join(CONFIG.DATASET_PATH, target_dataset, 'posts.csv'), header=None, encoding='utf-8')
	# #print(df_data)

	# full_data = np.array(full_data, dtype=np.float32)
	# full_data_norm = np.array(full_data_norm, dtype=np.float32)
	# temp = np.mean(np.mean(full_data, axis=2), axis=1)
	# print(temp.shape)
	# print("mean: ", np.mean(np.mean(full_data, axis=2), axis=1))
	# print("std: ", np.mean(np.std(full_data, axis=2), axis=1))
	# print("max: ", np.mean(np.max(full_data, axis=2), axis=1))
	# print("min: ", np.mean(np.min(full_data, axis=2), axis=1))
	# print("norm: ", full_data_norm)

	dataset_path = os.path.join(CONFIG.DATASET_PATH, target_dataset)
	if not os.path.exists(dataset_path):
		os.mkdir(dataset_path)
	with open(os.path.join('./data', 'pickle', 'hotel_reviews.p'), 'rb') as f:
		dataset = cPickle.load(f, encoding="latin1")
	print("tokenizing sentences...")
	shortcode_list = []
	word_list_list = []
	pbar = tqdm(total=len(dataset[0]))
	for pg in dataset[0]:
		pbar.update(1)
		data = " ".join([dataset[3][idx] for idx in pg])
		data = data.replace("END_TOKEN", "")
		word_list = process_text(data)
		if len(word_list) > 0:
			word_list_list.append(word_list)
	pbar.close()
	pbar = tqdm(total=len(dataset[1]))
	for pg in dataset[1]:
		pbar.update(1)
		data = " ".join([dataset[3][idx] for idx in pg])
		data = data.replace("END_TOKEN", "")
		word_list = process_text(data)
		if len(word_list) > 0:
			word_list_list.append(word_list)
	pbar.close()
	print("making corpus and csv files...")
	f_csv = open(os.path.join(dataset_path, 'posts.csv'), 'w', encoding='utf-8', newline='')  # newline='' keeps csv.writer from emitting blank rows on Windows
	f_corpus = open(os.path.join(dataset_path, 'corpus.txt'), 'w', encoding='utf-8')
	wr = csv.writer(f_csv)
	pbar = tqdm(total=len(word_list_list))
	for word_list in word_list_list:
		pbar.update(1)
		sentence = ' '.join(word_list) + ' <EOS>'
		out_row = []
		out_row.append('asd')  # placeholder shortcode for this test corpus
		out_row.append(sentence)
		wr.writerow(out_row)
		f_corpus.write(sentence + '\n')
	pbar.close()
	f_csv.close()
	f_corpus.close()
Example 8
        for start, stop in pairwise(path):
            track = graph.edge[start][stop]['track']
            playlist.append((track['name'], track['artists'][0]['name'],
                track['href']))
    return playlist
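
The loop above relies on a pairwise helper that is not shown in this excerpt. The standard itertools recipe below yields the same (s0, s1), (s1, s2), ... pairs (on Python 3.10+, itertools.pairwise is equivalent):

from itertools import tee

def pairwise(iterable):
    # s -> (s0, s1), (s1, s2), (s2, s3), ...
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)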


# TODO turn this thing into a webapp :)
def _print_playlist(playlist):
    for track in playlist:
        print("{!s:<20} by {!s:<30} {!s:<30}".format(*track))


# TODO improve text conditioning
def spoetify(text):
    words = text.split()
    graph = _build_graph(words)
    playlist = _build_playlist(graph, words)
    if not playlist:
        raise SystemExit
    _print_playlist(playlist)


if __name__ == '__main__':
    import fileinput
    input_parts = []
    for line in fileinput.input():
        input_parts.append(process_text(line))
    input_string = u' '.join(input_parts)
    spoetify(input_string)