def get_top_bigrams(files):
    bigram_freqs = partition_map(get_bigrams, [x[::-1] for x in files])

    # Currently, discard any files that couldn't be found.
    bigram_freqs = [x for x in bigram_freqs if x is not None]
    push_metadata_to_db("files_analyzed", force=True)

    # Merge dictionaries.
    years = [x[1] for x in files]
    global_freqs = init_dict(years, defaultdict(lambda: 0, {}))
    total_bigram_counts = init_dict(years, 0)
    metadata = init_dict(years, defaultdict(lambda: 0, {}))

    for year in years:
        metadata[year]["Files Analyzed"] += 1

    for year, freqs, file_size in bigram_freqs:
        total_bigram_counts[year] += file_size ** 2
        for bigram, freq in freqs.items():
            global_freqs[year][bigram] += freq
        metadata[year]["Total Word Count"] += file_size

    for year in global_freqs.keys():
        # Only return bigrams that make up more than .5% of all bigrams for that year.
        # freq_threshold = 0.005 * total_bigram_counts[year]
        # freq_threshold = 25
        # global_freqs[year] = {";".join(k): v for (k, v) in global_freqs[year].items() if v >= freq_threshold}
        global_freqs[year] = n_highest_entries(global_freqs[year], 50)

    return [global_freqs, metadata]
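# --- Hypothetical helper sketches (not part of the original source) ---
# get_top_bigrams relies on init_dict and n_highest_entries, which are not
# defined in this excerpt. Minimal implementations consistent with how they
# are used above might look like the following; the real helpers may differ.
import copy
import heapq

def init_dict(keys, default):
    # Assumed behavior: map every key to an independent copy of the default
    # value, so mutating one year's entry cannot leak into another's.
    return {key: copy.deepcopy(default) for key in keys}

def n_highest_entries(freqs, n):
    # Assumed behavior: keep only the n entries with the largest values.
    return dict(heapq.nlargest(n, freqs.items(), key=lambda kv: kv[1]))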
def get_word_family_data(file_data, keywords, in_app=True):
    filename, fileyear = file_data

    # TODO: Determine behavior when a file can't be found.
    try:
        with open(filename, "r") as f:
            # TODO: Remove call to lower()
            file = [line.strip().lower() for line in f]
    except FileNotFoundError:
        return None

    sigma = 5
    window_size = 4 * sigma
    weights = [math.exp((-x ** 2) / (2 * sigma)) / sigma for x in range(0, window_size + 1)]

    # Only calculate fcm for keywords that appear in the file.
    keywords = list(filter(lambda x: x in file, keywords))

    # Compute feature co-occurrence matrix using a
    # Gaussian weighting of word frequencies.
    fcm = init_dict(keywords, {x: 0 for x in keywords})
    # Also, build a frequency table for the keywords.
    word_freq = init_dict(keywords, 0)

    for i in range(len(file)):
        if file[i] not in keywords:
            continue
        word_freq[file[i]] += 1
        for j in range(max(0, i - window_size), min(len(file), i + window_size + 1)):
            # Don't want to compare word to itself.
            if i == j:
                continue
            if file[j] in keywords:
                fcm[file[i]][file[j]] += weights[abs(i - j)]
                # Avoid double counting if the words are the same.
                if file[i] != file[j]:
                    fcm[file[j]][file[i]] += weights[abs(i - j)]

    if in_app:
        inc_task_processed()
        push_metadata_to_db("files_analyzed")

    return (fileyear, fcm, word_freq, len(file))
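# --- Illustrative sketch (not part of the original source) ---
# The core of get_word_family_data is a Gaussian-weighted sliding window:
# every pair of keywords within window_size tokens of each other contributes
# a weight that decays with their distance. The standalone toy below applies
# the same accumulation to a plain token list, without the app plumbing
# (init_dict, inc_task_processed, push_metadata_to_db).
import math
from collections import defaultdict

def toy_cooccurrence(tokens, keywords, sigma=5):
    window_size = 4 * sigma
    weights = [math.exp((-x ** 2) / (2 * sigma)) / sigma for x in range(window_size + 1)]
    fcm = {k: defaultdict(float) for k in keywords}
    for i, word in enumerate(tokens):
        if word not in keywords:
            continue
        for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
            if j == i or tokens[j] not in keywords:
                continue
            fcm[word][tokens[j]] += weights[abs(i - j)]
            if word != tokens[j]:
                fcm[tokens[j]][word] += weights[abs(i - j)]
    return fcm

# e.g. toy_cooccurrence("the law and the court".split(), {"law", "court"})
# yields a symmetric weight between "law" and "court" that falls off with
# their token distance of 3.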
def word_freq(files, keywords):
    if isinstance(keywords, str):
        keywords = [keywords]

    # Remove duplicates and make all keywords lowercase.
    keywords = [keyword.lower() for keyword in set(keywords)]

    word_freqs = get_pool().starmap(get_word_freq, zip(files, repeat(keywords)))

    # Currently, discard any files that couldn't be found.
    word_freqs = [x for x in word_freqs if x is not None]
    set_task_metadata("files_analyzed", len(word_freqs))

    # Merge dictionaries.
    years = [x[1] for x in files]
    min_year = int(min(years))
    max_year = int(max(years))
    years = range(min_year, max_year + 1)

    global_word_freqs = init_dict(keywords, init_dict(years, 0))
    corpus_size = init_dict(years, 0)
    metadata = init_dict(years, defaultdict(lambda: 0, {}))

    for year, freqs, file_size in word_freqs:
        for word, frequency in freqs.items():
            global_word_freqs[word][year] += frequency
        corpus_size[year] += file_size
        metadata[year]["Files Analyzed"] += 1
        metadata[year]["Total Word Count"] += file_size

    # Convert absolute count to percentage.
    for keyword in global_word_freqs:
        for year in global_word_freqs[keyword]:
            val = global_word_freqs[keyword][year]
            if corpus_size[year] != 0:
                val = val / corpus_size[year] * 100
            global_word_freqs[keyword][year] = float("%.6f" % val)

    return [global_word_freqs, metadata]
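# --- Worked example (illustrative only, not in the original source) ---
# The final loop of word_freq turns each absolute keyword count into a
# percentage of that year's token count, rounded to six decimal places.
def _percentage_conversion_example():
    counts = {"liberty": {1990: 12, 1991: 0}}
    corpus_size = {1990: 4800, 1991: 0}
    for keyword in counts:
        for year, val in counts[keyword].items():
            if corpus_size[year] != 0:
                val = val / corpus_size[year] * 100
            counts[keyword][year] = float("%.6f" % val)
    return counts  # {"liberty": {1990: 0.25, 1991: 0.0}}: 12 of 4800 tokens = 0.25%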
def get_word_freq(file_data, keywords):
    filename, fileyear = file_data

    # TODO: Determine behavior when a file can't be found.
    try:
        with open(filename, "r") as f:
            # TODO: Handle .lower() here.
            file = [line.strip() for line in f]
    except FileNotFoundError:
        return None

    freqs = init_dict(keywords, 0)
    for word in file:
        word = word.lower()
        if word in keywords:
            freqs[word] += 1

    inc_task_processed()
    push_metadata_to_db("files_analyzed")

    return (fileyear, freqs, len(file))
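# --- Hypothetical standalone variant (not in the original source) ---
# Same counting logic as get_word_freq, minus the task-tracking calls
# (inc_task_processed, push_metadata_to_db), which is handy for exercising
# the counting in isolation. Assumes the corpus format used above: one
# token per line.
def count_keywords(filename, keywords):
    try:
        with open(filename, "r") as f:
            words = [line.strip().lower() for line in f]
    except FileNotFoundError:
        return None
    freqs = {keyword: 0 for keyword in keywords}
    for word in words:
        if word in freqs:
            freqs[word] += 1
    return freqs, len(words)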
def get_word_family_graph(file_list, word_families, in_app=True):
    keywords = []
    if isinstance(word_families, list):
        temp = {}
        for i in range(len(word_families)):
            temp[i] = word_families[i]
        word_families = temp

    for family in word_families.values():
        keywords = keywords + family

    # Remove stopwords.
    keywords = filter(lambda x: x not in stopwords, keywords)
    # Remove duplicates.
    keywords = list(set(keywords))

    if in_app:
        word_family_data = get_pool().starmap(get_word_family_data, zip(file_list, repeat(keywords)))
    else:
        word_family_data = list(map(lambda x: get_word_family_data(x, keywords, in_app), file_list))

    # Merge dictionaries.
    years = [x[1] for x in file_list]
    empty_fcm = defaultdict(lambda: copy.deepcopy(defaultdict(lambda: 0)))
    fcms = init_dict(years, empty_fcm)
    word_freqs = init_dict(years, defaultdict(lambda: 0, []))
    metadata = init_dict(years, defaultdict(lambda: 0, []))

    # Merge fcms by year.
    for entry in word_family_data:
        if entry is None:
            continue
        year, file_fcm, file_word_freqs, word_count = entry
        metadata[year]["Files Analyzed"] += 1
        metadata[year]["Total Word Count"] += word_count
        for keyword in file_fcm:
            word_freqs[year][keyword] += file_word_freqs[keyword]
            for word, gfreq in file_fcm[keyword].items():
                fcms[year][keyword][word] += gfreq

    # Convert from defaultdicts to dicts.
    fcms = dict(fcms)
    word_freqs = dict(word_freqs)
    metadata = dict(metadata)
    for year in fcms:
        word_freqs[year] = dict(word_freqs[year])
        fcms[year] = dict(fcms[year])
        metadata[year] = dict(metadata[year])
        for keyword in fcms[year]:
            fcms[year][keyword] = dict(fcms[year][keyword])

    # Normalize word freq table to [0, 1].
    for year in word_freqs:
        if len(word_freqs[year]) > 0:
            min_freq = min(word_freqs[year].values())
            max_freq = max(word_freqs[year].values())
            for word, freq in word_freqs[year].items():
                freq_range = max_freq - min_freq
                if max_freq == min_freq:
                    freq_range = 1
                word_freqs[year][word] = (freq - min_freq) / freq_range

    # Adjust weights in fcms.
    for year in fcms:
        max_edge_val = 0
        # weight = log(1 + weight)
        for keyword in fcms[year]:
            for word, val in fcms[year][keyword].items():
                fcms[year][keyword][word] = math.log(1 + val)
                max_edge_val = max(max_edge_val, fcms[year][keyword][word])
        # Normalize so <= 1.
        if max_edge_val != 0:
            for keyword in fcms[year]:
                for word, val in fcms[year][keyword].items():
                    fcms[year][keyword][word] = val / max_edge_val

    return [fcms, word_freqs, word_families, metadata]
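# --- Usage sketch (hypothetical paths and families, not in the original source) ---
# Outside the app, in_app=False runs the per-file analysis serially and skips
# the task-tracking calls. word_families may be a dict of named families or a
# plain list of lists (which is converted to an index-keyed dict above).
#
#   files = [("corpus/1990/a.txt", "1990"), ("corpus/1991/b.txt", "1991")]
#   families = {"governance": ["law", "court"], "economy": ["trade", "tariff"]}
#   fcms, node_weights, families, metadata = get_word_family_graph(
#       files, families, in_app=False)
#   # fcms[year][keyword][word] is a log-scaled edge weight normalized to (0, 1];
#   # node_weights[year][keyword] is the keyword's min-max-scaled frequency.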
import util
import config
import stock as s
import json
import tushare as ts
import operator
import pickle

dict = util.init_dict()
count_stock = {}
count_category = {}
count = 1
list = []
filename = "./data/list"

# filehandler = open(filename, 'r')
# list = pickle.load(filehandler)
# print list

for i in dict:
    print("In progress: ", count, "/", len(dict))
    count += 1
    if count == 20:
        break
    try:
        for row in ts.get_hist_data(i, start=config.START, end=config.END).itertuples():
            stock = s.Stock(i, "N/A", row[0], row[1], row[2], row[3], row[4], row[5], row[6], dict)
            # print row[0], row[1], row[2], row[3], row[4], row[5], row[6]
            list.append(stock)