Code Example #1
File: tasks.py Project: JackNeus/bookish-waddle
def get_top_bigrams(files):
    # partition_map presumably fans get_bigrams out over a worker pool;
    # the tuples are reversed because get_bigrams apparently expects (year, filename).
    bigram_freqs = partition_map(get_bigrams, [x[::-1] for x in files])
    # Currently, discard any files that couldn't be found.
    bigram_freqs = [x for x in bigram_freqs if x is not None]
    push_metadata_to_db("files_analyzed", force=True)

    # Merge dictionaries.
    years = [x[1] for x in files]

    global_freqs = init_dict(years, defaultdict(int))
    total_bigram_counts = init_dict(years, 0)
    metadata = init_dict(years, defaultdict(int))
    for year in years:
        # `years` has one entry per input file, so this counts every input
        # file, including any whose bigrams were discarded above.
        metadata[year]["Files Analyzed"] += 1

    for year, freqs, file_size in bigram_freqs:
        total_bigram_counts[year] += file_size ** 2
        for bigram, freq in freqs.items():
            global_freqs[year][bigram] += freq
        metadata[year]["Total Word Count"] += file_size
    for year in global_freqs.keys():
        # Keep only the 50 most frequent bigrams for each year. (An earlier,
        # commented-out variant kept bigrams above a frequency threshold
        # instead, e.g. 0.005 * total_bigram_counts[year] or a flat 25.)
        global_freqs[year] = n_highest_entries(global_freqs[year], 50)
    
    return [global_freqs, metadata]
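These examples lean on helpers (init_dict, n_highest_entries, partition_map) that are defined elsewhere in the project and not shown on this page. A minimal sketch of the two simplest ones, assuming init_dict copies its default so per-year values don't share state; this is a guess at their shape, not code from the project:

import copy
import heapq

def init_dict(keys, default):
    # Assumed: map each key to an independent copy of `default`, so that
    # mutating one year's dict doesn't mutate every other year's.
    return {key: copy.deepcopy(default) for key in keys}

def n_highest_entries(d, n):
    # Assumed: keep only the n entries with the largest values.
    return dict(heapq.nlargest(n, d.items(), key=lambda kv: kv[1]))
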
Code Example #2
File: tasks.py Project: JackNeus/bookish-waddle
def get_word_family_data(file_data, keywords, in_app=True):
    filename, fileyear = file_data

    # TODO: Determine behavior when a file can't be found.
    try:
        with open(filename, "r") as f:
            # TODO: Remove call to lower()
            file = [line.strip().lower() for line in f]
    except FileNotFoundError:
        return None
    
    sigma = 5
    window_size = 4 * sigma
    # NB: the denominator is 2*sigma rather than the usual 2*sigma**2,
    # so these weights form a Gaussian whose variance is sigma.
    weights = [math.exp((-x**2) / (2 * sigma)) / sigma for x in range(window_size + 1)]

    # Only calculate fcm for keywords that appear in the file.
    file_words = set(file)
    keywords = [x for x in keywords if x in file_words]

    # Compute feature co-occurrence matrix using a
    # Gaussian weighting of word frequencies.
    fcm = init_dict(keywords, {x: 0 for x in keywords})
    # Also, build a frequency table for the keywords.
    word_freq = init_dict(keywords, 0)
    for i in range(len(file)):
        if file[i] not in keywords:
            continue
        word_freq[file[i]] += 1
        for j in range(max(0, i - window_size), min(len(file), i + window_size + 1)):
            # Don't want to compare word to itself.
            if i == j:
                continue
            if file[j] in keywords:
                fcm[file[i]][file[j]] += weights[abs(i - j)]
                # Avoid double counting if the words are the same.
                if file[i] != file[j]:
                    fcm[file[j]][file[i]] += weights[abs(i - j)]
    if in_app:
        inc_task_processed()
        push_metadata_to_db("files_analyzed")
    return (fileyear, fcm, word_freq, len(file))
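As a quick sanity check on the window weighting above, the weights array can be computed standalone; with sigma = 5 the weight is 0.2 at distance 0 and has decayed to roughly 8.5e-19 at the window edge, so words outside the window would contribute essentially nothing anyway:

import math

sigma = 5
window_size = 4 * sigma
weights = [math.exp((-x**2) / (2 * sigma)) / sigma for x in range(window_size + 1)]
print(weights[0])            # 0.2
print(weights[1])            # ~0.1810
print(weights[window_size])  # ~8.5e-19
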
Code Example #3
File: tasks.py Project: JackNeus/bookish-waddle
def word_freq(files, keywords):
    if isinstance(keywords, str):
        keywords = [keywords]
    # Make all keywords lowercase, then remove duplicates (lowercasing first,
    # so that e.g. "Whale" and "whale" collapse into one entry).
    keywords = list({keyword.lower() for keyword in keywords})

    word_freqs = get_pool().starmap(get_word_freq, zip(files, repeat(keywords)))
    # Currently, discard any files that couldn't be found.
    word_freqs = [x for x in word_freqs if x is not None]
    set_task_metadata("files_analyzed", len(word_freqs))

    # Merge dictionaries.
    years = [x[1] for x in files]
    min_year = int(min(years))
    max_year = int(max(years))
    years = range(min_year, max_year+1)
    global_word_freqs = init_dict(keywords, init_dict(years, 0))
    corpus_size = init_dict(years, 0)
    metadata = init_dict(years, defaultdict(lambda: 0, {}))

    for year, freqs, file_size in word_freqs:
        for word, frequency in freqs.items():
            global_word_freqs[word][year] += frequency
        corpus_size[year] += file_size
        metadata[year]["Files Analyzed"] += 1
        metadata[year]["Total Word Count"] += file_size
    
    # Convert absolute counts to per-year percentages.
    for keyword in global_word_freqs:
        for year in global_word_freqs[keyword]:
            val = global_word_freqs[keyword][year]
            if corpus_size[year] != 0:
                val = val / corpus_size[year] * 100
            global_word_freqs[keyword][year] = float("%.6f" % val)

    return [global_word_freqs, metadata]
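A hypothetical call, assuming files is a list of (filename, year) pairs with numeric years, as the merging code implies; the paths and keywords below are made up:

files = [("corpus/1990_a.txt", 1990), ("corpus/1991_b.txt", 1991)]
freqs, metadata = word_freq(files, ["Whale", "ship"])
# freqs    -> {"whale": {1990: <pct>, 1991: <pct>}, "ship": {...}}
# metadata -> per-year "Files Analyzed" and "Total Word Count" counters
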
Code Example #4
File: tasks.py Project: JackNeus/bookish-waddle
def get_word_freq(file_data, keywords):
    filename, fileyear = file_data
    # TODO: Determine behavior when a file can't be found.
    try:
        with open(filename, "r") as f:
            # TODO: Handle .lower() here.
            file = [line.strip() for line in f]
    except FileNotFoundError:
        return None
    freqs = init_dict(keywords, 0)
    for word in file:
        word = word.lower()
        if word in keywords:
            freqs[word] += 1
    inc_task_processed()
    push_metadata_to_db("files_analyzed")
    return (fileyear, freqs, len(file))
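get_word_freq checks each word against keywords, a list, so every lookup is O(len(keywords)). A drop-in variant of the counting loop that uses a set instead (a suggested tweak, not code from the project):

def count_keywords(words, keywords):
    # Same counting logic as get_word_freq's inner loop, with O(1) lookups.
    keyword_set = set(keywords)
    freqs = {k: 0 for k in keywords}
    for word in words:
        word = word.lower()
        if word in keyword_set:
            freqs[word] += 1
    return freqs
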
Code Example #5
File: tasks.py Project: JackNeus/bookish-waddle
def get_word_family_graph(file_list, word_families, in_app=True):
    if isinstance(word_families, list):
        # Convert a bare list of families into a {index: family} dict.
        word_families = dict(enumerate(word_families))
    keywords = []
    for family in word_families.values():
        keywords += family
    # Remove stopwords.
    keywords = filter(lambda x: x not in stopwords, keywords)
    # Remove duplicates.
    keywords = list(set(keywords))

    if in_app:
        word_family_data = get_pool().starmap(get_word_family_data, zip(file_list, repeat(keywords)))
    else:
        word_family_data = list(map(lambda x: get_word_family_data(x, keywords, in_app), file_list))

    # Merge dictionaries.
    years = [x[1] for x in file_list]

    empty_fcm = defaultdict(lambda: copy.deepcopy(defaultdict(int)))
    fcms = init_dict(years, empty_fcm)
    word_freqs = init_dict(years, defaultdict(int))
    metadata = init_dict(years, defaultdict(int))

    # Merge fcms by year.
    for entry in word_family_data:
        if entry is None:
            continue
        year, file_fcm, file_word_freqs, word_count = entry
        metadata[year]["Files Analyzed"] += 1 
        metadata[year]["Total Word Count"] += word_count
        for keyword in file_fcm:
            word_freqs[year][keyword] += file_word_freqs[keyword]
            for word, gfreq in file_fcm[keyword].items():
                fcms[year][keyword][word] += gfreq
    
    # Convert from defaultdicts to dicts.
    fcms = dict(fcms)
    word_freqs = dict(word_freqs)
    metadata = dict(metadata)
    for year in fcms:
        word_freqs[year] = dict(word_freqs[year])
        fcms[year] = dict(fcms[year])
        metadata[year] = dict(metadata[year])
        for keyword in fcms[year]:
            fcms[year][keyword] = dict(fcms[year][keyword])

    # Normalize word freq table to [0, 1].
    for year in word_freqs:
        if len(word_freqs[year]) > 0:
            min_freq = min(word_freqs[year].values())
            max_freq = max(word_freqs[year].values())
            freq_range = max_freq - min_freq
            if freq_range == 0:
                freq_range = 1  # All frequencies equal; avoid dividing by zero.
            for word, freq in word_freqs[year].items():
                word_freqs[year][word] = (freq - min_freq) / freq_range

    # Adjust weights in fcms
    for year in fcms:
        max_edge_val = 0
        # weight = log(1 + weight)
        for keyword in fcms[year]:
            for word, val in fcms[year][keyword].items():
                fcms[year][keyword][word] = math.log(1+val)
                max_edge_val = max(max_edge_val, fcms[year][keyword][word])
        # Normalize so every edge weight is <= 1.
        if max_edge_val != 0:
            for keyword in fcms[year]:
                for word, val in fcms[year][keyword].items():
                    fcms[year][keyword][word] = val / max_edge_val

    return [fcms, word_freqs, word_families, metadata]
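A hypothetical invocation with made-up paths and families; passing in_app=False makes the function map over files sequentially instead of using the worker pool and skips the task-metadata updates:

file_list = [("corpus/1990_a.txt", 1990), ("corpus/1991_b.txt", 1991)]
families = {"sea": ["whale", "ship"], "land": ["farm", "plough"]}
fcms, word_freqs, families, metadata = get_word_family_graph(
    file_list, families, in_app=False)
# fcms[1990]["whale"]["ship"] would be the normalized co-occurrence
# weight between "whale" and "ship" across the 1990 files.
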
Code Example #6
import util
import config
import stock as s
import json
import tushare as ts
import operator
import pickle

stock_dict = util.init_dict()  # Renamed from `dict` to avoid shadowing the builtin.
count_stock = {}
count_category = {}
count = 1
stocks = []  # Renamed from `list` to avoid shadowing the builtin.
filename = "./data/list"

# filehandler = open(filename, 'rb')
# stocks = pickle.load(filehandler)
# print(stocks)

for i in stock_dict:
    print("In progress:", count, "/", len(stock_dict))
    count += 1
    if count == 20:
        # Early stop, apparently a debugging limit.
        break
    try:
        for row in ts.get_hist_data(i, start=config.START,
                                    end=config.END).itertuples():
            stock = s.Stock(i, "N/A", row[0], row[1], row[2], row[3], row[4],
                            row[5], row[6], stock_dict)
            # print(row[0], row[1], row[2], row[3], row[4], row[5], row[6])
            stocks.append(stock)
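
The commented-out lines near the top suggest the collected list is pickled to ./data/list between runs. A minimal sketch of that round trip, assumed rather than taken from the project, since the save step isn't shown in this excerpt:

import pickle

stocks = ["placeholder"]  # Stands in for the Stock objects collected above.

# Assumed save step matching the commented-out load code.
with open("./data/list", "wb") as fh:
    pickle.dump(stocks, fh)

with open("./data/list", "rb") as fh:
    stocks = pickle.load(fh)
print(stocks)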