def get_month_after_limited(percent_change_file):
    """Return the months following the best and worst months, capped per year.

    The mapping returned by ``load_percent_change`` is ranked by value and
    every ``(key, value)`` pair is printed in ascending order. The top and
    bottom sixth of the keys are then selected, and each selected key is
    replaced by the first day of the following calendar month.

    At most two source months are kept per calendar year. "Good" months are
    visited from the highest value downwards and "bad" months from the lowest
    value upwards, so the most extreme months of each year win.

    Args:
        percent_change_file: Passed straight to ``load_percent_change``.
            Presumably a path to a percent-change CSV whose rows become
            date-like keys -- confirm against ``load_percent_change``.

    Returns:
        A ``(good, bad)`` tuple of lists of ``date`` objects, each set to the
        first day of the month after a selected month.
    """
    econ_seq = load_percent_change(percent_change_file)
    sorted_econ = sorted(econ_seq, key=econ_seq.get)
    for x in sorted_econ:
        print(x, econ_seq[x])

    # Take the top sixth as good months and the bottom sixth as bad months.
    num = len(econ_seq) // 6
    prev_bad = sorted_econ[:num]
    # Guard num == 0: ``seq[-0:]`` is the whole list, not an empty one.
    prev_good = sorted_econ[-num:] if num else []
    prev_good = reversed(prev_good)

    def plus_month(seq):
        """Map each date to the first day of the next month, max two per year."""
        new_seq = []
        year_to_count = defaultdict(int)
        for x in seq:
            if year_to_count[x.year] >= 2:
                continue
            year_to_count[x.year] += 1
            # Calendar arithmetic instead of "+31 days": adding 31 days to a
            # late-month date (e.g. Jan 31) would jump over the next month.
            if x.month == 12:
                new_seq.append(date(x.year + 1, 1, 1))
            else:
                new_seq.append(date(x.year, x.month + 1, 1))
        return new_seq

    return plus_month(prev_good), plus_month(prev_bad)
def get_same_month(percent_change_file):
    """Return the best and worst months themselves, ranked by value.

    The keys of the mapping returned by ``load_percent_change`` are sorted by
    their associated value; the top sixth are the "good" months and the bottom
    sixth the "bad" months.

    Args:
        percent_change_file: Passed straight to ``load_percent_change``.

    Returns:
        A ``(good, bad)`` tuple of lists of keys, both in ascending order of
        value. Both lists are empty when there are fewer than six entries.
    """
    econ_seq = load_percent_change(percent_change_file)
    sorted_econ = sorted(econ_seq, key=econ_seq.get)

    # Take the top sixth as good months and the bottom sixth as bad months.
    num = len(econ_seq) // 6
    prev_bad = sorted_econ[:num]
    # Guard num == 0: ``seq[-0:]`` is the whole list, not an empty one.
    prev_good = sorted_econ[-num:] if num else []
    return prev_good, prev_bad
def get_month_just_after(percent_change_file):
    """Return the months immediately following the best and worst months.

    The keys of the mapping returned by ``load_percent_change`` are sorted by
    their associated value. The top and bottom sixth are selected, and each
    selected key is replaced by the first day of the following calendar month.

    Args:
        percent_change_file: Passed straight to ``load_percent_change``.

    Returns:
        A ``(good, bad)`` tuple of lists of ``date`` objects (first of the
        month), in ascending order of the source month's value.
    """
    econ_seq = load_percent_change(percent_change_file)
    sorted_econ = sorted(econ_seq, key=econ_seq.get)

    # Take the top sixth as good months and the bottom sixth as bad months.
    num = len(econ_seq) // 6
    prev_bad = sorted_econ[:num]
    # Guard num == 0: ``seq[-0:]`` is the whole list, not an empty one.
    prev_good = sorted_econ[-num:] if num else []

    def plus_month(seq):
        """Map each date to the first day of the next calendar month."""
        # Calendar arithmetic instead of "+31 days": adding 31 days to a
        # late-month date (e.g. Jan 31) would jump over the next month.
        return [
            date(x.year + 1, 1, 1) if x.month == 12
            else date(x.year, x.month + 1, 1)
            for x in seq
        ]

    return plus_month(prev_good), plus_month(prev_bad)
def get_good_month_prev(percent_change_file, percent=6):
    """Pair each top-ranked month with the month that follows it.

    The keys of the mapping returned by ``load_percent_change`` are sorted by
    their associated value and the highest ``1 / percent`` fraction is taken.
    The month of the upturn is returned as "bad" and the month after it as
    "good".

    Args:
        percent_change_file: Passed straight to ``load_percent_change``.
        percent: Divisor applied to the number of entries to get the sample
            size. Despite the name it is not a percentage; the default of 6
            selects the top sixth.

    Returns:
        A ``(good, bad)`` tuple where ``bad`` is the list of top-ranked keys
        (ascending by value) and ``good`` is the list of ``date`` objects for
        the first day of the month after each of them.
    """
    econ_seq = load_percent_change(percent_change_file)
    sorted_econ = sorted(econ_seq, key=econ_seq.get)

    num = int(len(econ_seq) / percent)
    # Guard num == 0: ``seq[-0:]`` is the whole list, not an empty one.
    prev_good = sorted_econ[-num:] if num else []

    def plus_month(seq):
        """Map each date to the first day of the next calendar month."""
        # Calendar arithmetic instead of "+31 days": adding 31 days to a
        # late-month date (e.g. Jan 31) would jump over the next month.
        return [
            date(x.year + 1, 1, 1) if x.month == 12
            else date(x.year, x.month + 1, 1)
            for x in seq
        ]

    # Take the month of the upturn as "bad" and the next month as "good".
    # (An unreachable alternative return that called an undefined
    # ``minus_month`` helper used to follow this line; it has been removed.)
    return plus_month(prev_good), prev_good
def main():
    """Print statistics relating monthly frame shifts to the economic series.

    Steps, in order:
      1. Parse ``--percent_change``, ``--input_path`` and ``--keywords``.
      2. Load the keyword list, the percent-change series (values ordered by
         sorted key), the background corpus and the pickled frame lexicon
         from ``frame_to_lex.pickle`` in the working directory.
      3. For each consecutive pair of monthly files, compute log-odds deltas
         with ``write_log_odds`` for the two count sets returned by
         ``LoadCountsExternal`` (labelled "external" and "internal" in the
         output), and sum the deltas over each frame's lexicon words.
      4. Call ``print_stats`` on the economic series and each frame's summed
         sequence, first for the external and then for the internal counts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--percent_change",
        default="/usr1/home/anjalief/corpora/russian/percent_change/russian_rtsi_rub.csv",
    )
    parser.add_argument(
        "--input_path",
        default="/usr1/home/anjalief/corpora/russian/yearly_mod_subs/iz_lower/",
    )
    parser.add_argument("--keywords", default="./keywords.txt")
    args = parser.parse_args()

    # DANGER DANGER DANGER
    # NOTE(review): the original author's warning above is kept verbatim; the
    # reason for it is not visible in this file.
    with open(args.keywords) as keyword_file:
        keywords = [line.strip() for line in keyword_file]

    econ_dict = load_percent_change(args.percent_change)
    econ_seq = [econ_dict[d] for d in sorted(econ_dict)]

    prior = LoadBackgroundCorpus(args.input_path)

    # NOTE: pickle.load can execute arbitrary code; only run this against a
    # frame_to_lex.pickle that you produced yourself.
    with open("frame_to_lex.pickle", "rb") as lex_file:
        frame_to_lex = pickle.load(lex_file)

    date_seq, filenames = get_files_by_time_slice(args.input_path, "monthly")

    prev_e = None
    prev_i = None
    frame_to_seq_e = defaultdict(list)
    frame_to_seq_i = defaultdict(list)

    # zip() is kept so iteration still stops at the shorter of the two lists.
    for _, filename in zip(date_seq, filenames):
        curr_e, curr_i, _, _, _ = LoadCountsExternal(filename, keywords)

        # The first slice only seeds the "previous" counts.
        # NOTE(review): this is a truthiness test, so an empty (falsy) count
        # object is also treated as "no previous month" and the following
        # delta is skipped -- confirm that this is intended.
        if not prev_e:
            prev_e = curr_e
            prev_i = curr_i
            continue

        delta_e = write_log_odds(prev_e, curr_e, prior)
        delta_i = write_log_odds(prev_i, curr_i, prior)

        for c in frame_to_lex:
            summary_e = 0
            summary_i = 0
            for word in frame_to_lex[c]:
                if word in delta_e:
                    summary_e += delta_e[word]
                else:
                    print("E Skipping ", word)
                if word in delta_i:
                    summary_i += delta_i[word]
                else:
                    print("I Skipping ", word)
            frame_to_seq_e[c].append(summary_e)
            frame_to_seq_i[c].append(summary_i)

        prev_e = curr_e
        prev_i = curr_i

    # Done processing files
    dash_rule = "-------------------------------------------------------------------------------"
    star_rule = "******************************************************************************"

    for title, frame_to_seq in (
        ("EXTERNAL", frame_to_seq_e),
        ("INTERNAL", frame_to_seq_i),
    ):
        print(dash_rule)
        print(title)
        print(dash_rule)
        for c in frame_to_seq:
            print(c)
            print_stats(econ_seq, frame_to_seq[c])
            print(star_rule)