def ngram_volume(volume_path):
    counts = defaultdict(             # year
        lambda: defaultdict(          # ngram_len
            lambda: defaultdict(      # ngram
                int                   # count
    )))
    for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
        pq = parse_file(case_xml_path)
        tokens = tokenize_text(get_case_text(pq))
        history = []
        case_year = get_decision_date(pq).year
        for i, item in enumerate(tokens):
            history.append(item)
            for ngram_len in [1, 2, 3]:
                if len(history) >= ngram_len:
                    counts[case_year][ngram_len]["\t".join(history[-ngram_len:])] += 1
            if i >= 2:
                del history[0]
    for year, ngram_lens in counts.iteritems():
        out_dir = os.path.join(volume_path.replace(source_dir, dest_dir, 1), str(year))
        makedirs(out_dir)
        for ngram_len, data in ngram_lens.items():
            with open(os.path.join(out_dir, "%s.json" % ngram_len), 'w') as out:
                json.dump(data, out)

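# Hedged usage sketch (not part of the original source): a driver that applies
# ngram_volume to every volume directory. The jurisdiction/reporter/volume
# directory layout under source_dir is an assumption, inferred from how
# sort_case elsewhere in this collection lays out its output.
def ngram_all_volumes():
    for volume_path in tqdm(sorted(glob(os.path.join(source_dir, "*/*/*")))):
        ngram_volume(volume_path)
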
def search_volumes():
    makedirs(dest_dir)
    for series_path in tqdm(sorted(glob(os.path.join(source_dir, "*/*")))):
        series_name = os.path.basename(series_path)
        known_series = defaultdict(lambda: {'count': 0, 'examples': []})
        for volume_path in sorted(glob(os.path.join(series_path, "*"))):
            for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
                pq = parse_file(case_xml_path)
                text = get_case_text(pq)
                cites = cite_match.findall(text)
                for series in cites:
                    ks = known_series[series[1]]
                    ks['count'] += 1
                    if len(ks['examples']) < 3:
                        ks['examples'].append(" ".join(series))

        # write to CSV
        out = [[k, v['count']] + v['examples'] for k, v in known_series.iteritems()]
        out.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(dest_dir, '%s.csv' % series_name), 'wb') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['Series', 'Count', 'Example 1', 'Example 2', 'Example 3'])
            for row in out:
                csvwriter.writerow(row)

def wordcloud_all_jurisdictions():
    for jurisdiction_path in glob(os.path.join(source_dir, "Illinois")):
        print "Processing", jurisdiction_path
        out_dir = jurisdiction_path.replace(source_dir, dest_dir, 1)
        top_words_out_dir = jurisdiction_path.replace(source_dir, top_words_dir, 1)
        makedirs(out_dir)
        makedirs(top_words_out_dir)

        # write global
        global_freqs = process_word_dict(json.load(open(os.path.join(jurisdiction_path, 'totals/1.json'))))
        save_wordcloud(global_freqs, os.path.join(out_dir, 'totals.png'))

        # load year data
        bare_top_words_by_year = {}
        for year_path in tqdm(glob(os.path.join(jurisdiction_path, "*"))):
            year = os.path.basename(year_path)
            if year == 'totals':
                continue
            year_freqs = process_word_dict(json.load(open(os.path.join(year_path, '1.json'))))

            # skip years with few cases (probably typos)
            if sum(w[1] for w in year_freqs) < 10000:
                continue

            bare_top_words_by_year[year] = [w[0] for w in year_freqs]

        # calculate global rankings
        word_to_ranking = defaultdict(lambda: 0)
        for year, words in tqdm(bare_top_words_by_year.iteritems()):
            for pos, word in enumerate(words):
                word_to_ranking[word] += (1000 - pos)
        average_rank = dict([(word, ranking / len(bare_top_words_by_year))
                             for word, ranking in word_to_ranking.iteritems()])

        # write average ranks CSV
        save_top_list(['word', 'average_rank'],
                      [[word, 1000 - rank] for word, rank in average_rank.iteritems()],
                      os.path.join(top_words_out_dir, 'average_ranks.csv'))

        # calculate scores by year
        year_to_volatile_words = {}
        for year, words in tqdm(bare_top_words_by_year.iteritems()):
            word_to_ranking_delta = {}
            for pos, word in enumerate(words):
                rank_for_year = 1000 - pos
                word_to_ranking_delta[word] = (rank_for_year, average_rank[word],
                                               rank_for_year - average_rank[word])
            year_to_volatile_words[year] = sorted(word_to_ranking_delta.items(),
                                                  key=lambda x: x[1][2], reverse=True)

        # wordclouds
        for year, words in tqdm(year_to_volatile_words.iteritems()):
            freqs = [(w[0], w[1][2]) for w in words[:200]]
            save_wordcloud(freqs, os.path.join(out_dir, '%s.png' % year))

            # write year ranks CSV
            save_top_list(['word', 'absolute_rank', 'relative_score'],
                          [(w[0], 1000 - w[1][0], w[1][2]) for w in words],
                          os.path.join(top_words_out_dir, '%s_ranks.csv' % year))

def bam_sort(bam_filename, sorted_bam_filename, tempdir, mem="2G", **kwargs):
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'SortSam',
        'INPUT=' + bam_filename,
        'OUTPUT=' + sorted_bam_filename,
        'SORT_ORDER=coordinate',
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        **kwargs)

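# Hedged usage sketch (not from the original pipeline; file names, temp dir and
# memory setting are placeholders). pypeliner.commandline.execute shells out,
# so `picard` must be available on PATH for the call to succeed:
#
#     bam_sort('sample.bam', 'sample.sorted.bam', tempdir='tmp/sort_sample', mem='4G')
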
def tokenize_case(case_xml_path):
    out_path = case_xml_path.replace(source_dir, dest_dir, 1).replace('.xml', '.txt')
    if os.path.exists(out_path):
        return
    pq = parse_file(case_xml_path)
    case_text = get_case_text(pq)
    tokens = nltk.word_tokenize(case_text)
    makedirs(os.path.dirname(out_path))
    with open(out_path, 'w') as out:
        out.write(u"\n".join(tokens).encode("utf8"))

def search_front_matter():
    for jurisdiction_path in sorted(glob(os.path.join(source_dir, "*"))):
        makedirs(jurisdiction_path.replace(source_dir, dest_dir))
        for series_path in glob(os.path.join(jurisdiction_path, "*")):
            print series_path
            try:
                out = u""
                for volume_path in sorted(glob(os.path.join(series_path, "*")),
                                          key=lambda x: int(x.rsplit('/', 1)[1])):

                    # load first case in volume
                    case_paths = sorted(glob(os.path.join(volume_path, "*.xml")))
                    if not case_paths:
                        continue
                    first_case_path = case_paths[0]
                    pq = parse_file(first_case_path)

                    # stop processing volume after 1923
                    year = get_decision_date(pq).year
                    if year > 1923:
                        break

                    # get first alto file for first case
                    first_case_alto_file = pq('METS|fileGrp[USE="alto"] METS|FLocat')[0].attrib[qn("xlink|href")][3:]
                    first_case_alto_name = os.path.basename(first_case_alto_file)

                    # get directory for alto files for volume
                    case_id = pq("case|case").attr('caseid')
                    alto_dir = os.path.dirname(os.path.join(raw_source_dir, case_id, first_case_alto_file)).replace('_0001', '_redacted')

                    # process alto files until we hit the one for the first case in the volume
                    for alto_path in sorted(glob(os.path.join(alto_dir, "*"))):
                        if alto_path.endswith(first_case_alto_name):
                            break

                        # only bother parsing XML if we find 'reporter' in the text of the alto file somewhere
                        alto_data = open(alto_path).read()
                        if 'reporter' not in alto_data.lower():
                            continue
                        alto_pq = PyQuery(alto_data, parser='xml', namespaces=namespaces)

                        # extract OCR'd text from alto XML
                        alto_text = " ".join(x.attrib["CONTENT"] for x in alto_pq('alto|String'))

                        # if page has more than fifty lowercase words, less than 15 uppercase words (usually a list of judges),
                        # and less than 30 periods (usually a table of contents), print citation and page text
                        if len(re.findall(r'\b[a-z]+\b', alto_text)) > 50 and len(re.findall(r'\b[A-Z][A-Z]+\b', alto_text)) < 15 and len(re.findall(r'\.', alto_text)) < 30:
                            volume_cite = pq('case|citation[category="official"]').text().rsplit(" ", 1)[0]
                            out += "%s\n%s\n%s\n\n" % (alto_path, volume_cite, alto_text)

                # write out all matched pages for series
                if out:
                    open((series_path.replace(source_dir, dest_dir) + ".txt").replace('..', '.'), "w").write(out.encode('utf8'))
            except Exception as e:
                print "Skipping -- %s" % e

def sort_case(case_xml_path):
    pq = parse_file(case_xml_path)
    jurisdiction = get_jurisdiction(pq)
    citation = get_citation(pq)
    cite_parts = citation.split(" ")
    volume, reporter, page_number = cite_parts[0], " ".join(cite_parts[1:-1]), cite_parts[-1]
    volume_dir = os.path.join(jurisdiction, reporter, volume)
    makedirs(volume_dir)
    dest_path = os.path.join(volume_dir, os.path.basename(case_xml_path))
    if os.path.exists(dest_path):
        os.remove(dest_path)
    os.link(case_xml_path, dest_path)

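# Worked example (hedged; the citation is illustrative, not taken from the data):
# if get_citation returns "123 Ill. App. 2d 456" and get_jurisdiction returns
# "Illinois", the split above yields volume "123", reporter "Ill. App. 2d" and
# page_number "456", so the case file is hard-linked into "Illinois/Ill. App. 2d/123/".
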
def bam_markdups(bam_filename, markduped_bam_filename, metrics_filename, tempdir, mem="2G", **kwargs):
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'MarkDuplicates',
        'INPUT=' + bam_filename,
        'OUTPUT=' + markduped_bam_filename,
        'METRICS_FILE=' + metrics_filename,
        'REMOVE_DUPLICATES=False',
        'ASSUME_SORTED=True',
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        **kwargs)

def merge_limerick_lines():
    merged = defaultdict(                 # line_type
        lambda: defaultdict(              # emphasis_pattern
            lambda: defaultdict(          # last_syllable
                lambda: defaultdict(      # last_token
                    list                  # sentence
    ))))
    for path in tqdm(glob(os.path.join(source_dir, "*"))):
        line_types = json.load(open(path))
        for line_type, emphasis_patterns in line_types.iteritems():
            for emphasis_pattern, last_syllables in emphasis_patterns.iteritems():
                for last_syllable, last_tokens in last_syllables.iteritems():
                    for last_token, lines in last_tokens.iteritems():
                        merged[line_type][emphasis_pattern][last_syllable][last_token.lower()].extend(lines)

    filtered = defaultdict(               # line_type
        lambda: defaultdict(              # emphasis_pattern
            lambda: defaultdict(          # last_syllable
                lambda: dict)))
    for line_type, emphasis_patterns in merged.iteritems():
        for emphasis_pattern, last_syllables in emphasis_patterns.iteritems():

            # skip long lines that don't include '1**1**1'
            if line_type == 'long' and '1**1**1' not in emphasis_pattern:  # len(emphasis_pattern)<5:
                continue

            # skip short lines that are too short
            if line_type == 'short' and len(emphasis_pattern) < 4:
                continue

            for last_syllable, last_tokens in last_syllables.iteritems():

                # skip groups with insufficient options
                if (line_type == 'long' and len(last_tokens) < 3) or (line_type == 'short' and len(last_tokens) < 2):
                    continue

                filtered[line_type][emphasis_pattern][last_syllable] = last_tokens

    makedirs(dest_dir)
    json.dump(filtered, open(os.path.join(dest_dir, 'limerick_lines.json'), 'w'))

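# Hedged sketch of the nesting this function builds and writes to
# limerick_lines.json (the keys and line text below are invented placeholders;
# only the nesting order comes from the code above):
#
# {
#   "long": {                                   # line_type
#     "01**1**1**1": {                          # emphasis_pattern
#       "ay": {                                 # last_syllable
#         "day": ["a line ending in day", ...]  # last_token -> list of lines
#       }
#     }
#   }
# }
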
def bam_collect_gc_metrics(bam_filename, ref_genome, metrics_filename,
                           summary_filename, chart_filename, tempdir,
                           mem="2G", **kwargs):
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectGcBiasMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'S=' + summary_filename,
        'CHART_OUTPUT=' + chart_filename,
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        **kwargs)

def sort_all_volumes():
    # make everything in a temp dir
    makedirs(tmp_dest_dir)
    os.chdir(tmp_dest_dir)

    dirs = sorted(glob(os.path.join(source_dir, "from_vendor/*")))
    for i, volume_path in enumerate(tqdm(dirs)):

        # skip dirs that are superseded by the following version
        base_name = volume_path.split('_redacted_', 1)[0]
        if i < len(dirs) - 1 and dirs[i + 1].startswith(base_name):
            # print "Skipping", volume_path
            continue

        sort_volume(volume_path)

    # swap temp dir in place of existing dir
    shutil.rmtree(dest_dir)
    os.rename(tmp_dest_dir, dest_dir)

def bam_collect_wgs_metrics(bam_filename, ref_genome, metrics_filename,
                            config, tempdir, mem="2G", **kwargs):
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectWgsMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'MINIMUM_BASE_QUALITY=' + str(config['picard_wgs_params']['min_bqual']),
        'MINIMUM_MAPPING_QUALITY=' + str(config['picard_wgs_params']['min_mqual']),
        'COVERAGE_CAP=500',
        'VALIDATION_STRINGENCY=LENIENT',
        'COUNT_UNPAIRED=' + ('True' if config['picard_wgs_params']['count_unpaired'] else 'False'),
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        **kwargs)

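# Hedged example of the config fragment this wrapper expects (the values are
# placeholders; only the key names come from the code above):
#
# config = {
#     'picard_wgs_params': {
#         'min_bqual': 20,
#         'min_mqual': 20,
#         'count_unpaired': False,
#     }
# }
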
def bam_collect_insert_metrics(bam_filename, flagstat_metrics_filename,
                               metrics_filename, histogram_filename, tempdir,
                               mem="2G", **kwargs):
    # Check if any paired reads exist
    has_paired = None
    with open(flagstat_metrics_filename) as f:
        for line in f:
            if 'properly paired' in line:
                if line.startswith('0 '):
                    has_paired = False
                else:
                    has_paired = True

    if has_paired is None:
        raise Exception(
            'Unable to determine number of properly paired reads from {}'.format(
                flagstat_metrics_filename))

    if not has_paired:
        with open(metrics_filename, 'w') as f:
            f.write('## FAILED: No properly paired reads\n')
        with open(histogram_filename, 'w'):
            pass
        return

    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectInsertSizeMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'HISTOGRAM_FILE=' + histogram_filename,
        'ASSUME_SORTED=True',
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        **kwargs)

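# Hedged note: the paired-read check above keys off the `samtools flagstat`
# summary line containing 'properly paired', which typically looks roughly like
#
#     12345678 + 0 properly paired (97.53% : N/A)
#
# so a line starting with '0 ' means no properly paired reads and the Picard
# insert-size step is skipped.
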
def produce_fastqc_report(fastq_filename, output_html, output_plots, temp_dir, **kwargs):
    makedirs(temp_dir)

    pypeliner.commandline.execute(
        'fastqc',
        '--outdir=' + temp_dir,
        fastq_filename,
        **kwargs)

    fastq_basename = os.path.basename(fastq_filename)
    if fastq_basename.endswith(".fastq.gz"):
        fastq_basename = fastq_basename.replace(".fastq.gz", "")
    elif fastq_basename.endswith(".fq.gz"):
        fastq_basename = fastq_basename.replace(".fq.gz", "")
    elif fastq_basename.endswith(".fq"):
        fastq_basename = fastq_basename.replace(".fq", "")
    elif fastq_basename.endswith(".fastq"):
        fastq_basename = fastq_basename.replace(".fastq", "")
    else:
        raise Exception("Unknown file type")

    output_basename = os.path.join(temp_dir, fastq_basename)

    shutil.move(output_basename + '_fastqc.zip', output_plots)
    shutil.move(output_basename + '_fastqc.html', output_html)

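# Hedged usage sketch (paths are placeholders, not from the original source).
# FastQC writes <basename>_fastqc.html and <basename>_fastqc.zip into --outdir,
# which is why the suffix stripping above mirrors FastQC's own naming:
#
#     produce_fastqc_report('reads_R1.fastq.gz',
#                           'qc/reads_R1_fastqc.html',
#                           'qc/reads_R1_fastqc.zip',
#                           'tmp/fastqc_R1')
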
def save_counts(counts, jurisdiction_path, subdir):
    out_dir = os.path.join(jurisdiction_path.replace(source_dir, dest_dir, 1), str(subdir))
    makedirs(out_dir)
    for ngram_length, data in counts.iteritems():
        with open(os.path.join(out_dir, "%s.json" % ngram_length), 'w') as out:
            json.dump(data, out)

"Test File name = {}, Preprocessed Test input size {}, Test Input size before preprocessing = {}" .format(testfile, test_input.size(), mean_norm_data_df.shape[0])) #HYPERPARAMETERS 2. NUM_INSTANCES = data_df.shape[0] BATCHSIZE = int(args.batchsize) BATCHSIZE = min(NUM_INSTANCES, BATCHSIZE) NUM_BATCHES = int(NUM_INSTANCES / BATCHSIZE) root.info("Results output dir path: {}".format(RESULTS_OUTPUT_DIR_NAME)) root.info("Sequence Length = {}, TOTAL NUMBER OF INSTANCES = {}".format( ENCODER_SEQUENCE_LENGTH, NUM_INSTANCES)) #Check whether figure directories exist. If not create them. TESTOUTPUTDIR = RESULTS_OUTPUT_DIR_NAME + "results_test/" + os.path.basename( testfile).split(".csv")[0] makedirs(TESTOUTPUTDIR) TESTOUTPUTFILE = TESTOUTPUTDIR + "/" + "test_mse.txt" root.info("Results output dir path: {}".format(RESULTS_OUTPUT_DIR_NAME)) root.info("Sequence Length = {}, TOTAL NUMBER OF INSTANCES = {}".format( ENCODER_SEQUENCE_LENGTH, NUM_INSTANCES)) #Instantiate Attention Encoder encoder = EncoderHierAttn(INPUT_SIZE, HIDDEN_SIZE, ENCODER_SEQUENCE_LENGTH, NUM_LAYERS, args.rnnobject, root) encoder.load_state_dict(torch.load(MODEL_OUTPUT_DIR + "encoder.pt")) encoder.eval() encoder.cuda() ##CUDA #Instantiate Attention Decoder decoder = DecoderHierAttn(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, OUTPUT_SIZE,
def make_output_dir(key):
    output_dir = os.path.join(dest_dir, key)
    makedirs(output_dir)
    return output_dir

def write_lines():
    out_path = jurisdiction_path.replace(source_dir, dest_dir, 1).rstrip('/') + '.json'
    makedirs(os.path.dirname(out_path))
    with open(out_path, 'w') as out:
        json.dump(lines, out)

BATCH_SIZE = int(args.batchsize)
DROPOUT = float(args.dropout)
SLIDING_ATTENTION = json.loads(args.slidingattention.lower())
HIERARCHICAL_ATTN_METHOD = args.hierattnmethod
SLIDING_WINSIZE = int(args.slidingwindowsize)

# CREATE
RESULTS_OUTPUT_DIR_NAME = args.predictionsoutputdir + "/" + "SEQUENCE_LENGTH_{}_NUMLAYERS_{}_HIDDEN_SIZE_{}_DROPOUT_{}_TEACHERFORCING_{}_RNNOBJECT_{}_ITERNUM_{}".format(
    ENCODER_SEQUENCE_LENGTH, NUM_LAYERS, HIDDEN_SIZE, DROPOUT,
    args.teacherforcing, args.rnnobject, args.iternum) + "/"
print("Results Output Dir Name = {}".format(RESULTS_OUTPUT_DIR_NAME))
MODEL_OUTPUT_DIR = RESULTS_OUTPUT_DIR_NAME + "model/"
VALIDATION_OUTPUT_FILE = RESULTS_OUTPUT_DIR_NAME + "validation_mse.txt"

#Create figures train and validation directories.
print("Model_output_dir = {}".format(MODEL_OUTPUT_DIR))
makedirs(MODEL_OUTPUT_DIR)
makedirs(RESULTS_OUTPUT_DIR_NAME + "figures_train")
makedirs(RESULTS_OUTPUT_DIR_NAME + "figures_validation")
root.info("Results output dir path: {}".format(RESULTS_OUTPUT_DIR_NAME))

"""
PREPROCESS TIMESERIES DATA.
If you want to run the model on a new dataset and select a sub-set of columns,
add a condition to the following set of conditional statements
`elif "newdatasetsubstring" in args.datasetname: ...Do Something...`
"""
if "tep" in args.datasetname:
    #columnsofinterest=['MEAS_A_Feed', 'MEAS_D_Feed', 'MEAS_E_Feed', 'MEAS_A_C Feed','MEAS_Recycle_flow', 'MEAS_Reactor_feed', 'MEAS_Reactor_pressure','MEAS_Reactor_level', 'MEAS_Reactor_temperature', 'MEAS_Purge_rate']
    columnsofinterest = [

def aggregate_reporters():
    makedirs(dest_dir)
    aggregate = {}

    # get map of reporter key to canonical name in FLP db
    flp_keys = {}
    for reporter_list in REPORTERS.itervalues():
        for reporter in reporter_list:
            fields = [reporter['cite_type'], reporter['name']]
            for k in reporter["editions"].keys():
                flp_keys[cite_to_key(k)] = fields + [k]
            for k, v in reporter["variations"].items():
                flp_keys[cite_to_key(k)] = fields + [v]

    # get map of reporter key to name in Juris-M db
    juris_keys = {}
    for json_file, label in [['../lib/jurism-abbreviations/primary-us.json', 'primary'],
                             ['../lib/jurism-abbreviations/secondary-us-bluebook.json', 'secondary']]:
        data = json.load(open(os.path.join(os.path.dirname(__file__), json_file)))
        for juris in data["xdata"].itervalues():
            for full_name, short_name in juris["container-title"].iteritems():
                key = cite_to_key(short_name)
                if key not in juris_keys:
                    juris_keys[key] = [label, short_name, full_name]

    # get map of reporter key to CAP reporter
    cap_keys = {}
    for reporter in json.load(open(os.path.join(os.path.dirname(__file__), '../lib/reporter-list/reporters.json'))):
        key = cite_to_key(reporter['short'])
        if key not in cap_keys:
            cap_keys[key] = [reporter['reporter'], reporter['short']]

    # aggregate rows in our collected citations
    for csv_path in tqdm(sorted(glob(os.path.join(source_dir, "*.csv")))):
        csvreader = csv.DictReader(open(csv_path))
        for row in csvreader:
            key = cite_to_key(row['Series'])
            if key in aggregate:
                aggregate[key]['Count'] += int(row['Count'])
            else:
                row['Examples'] = ['', '', '']
                row['Count'] = int(row['Count'])
                row['Series'] = key
                row['FLP'] = flp_keys.get(key, ['', '', ''])
                row['juris'] = juris_keys.get(key, ['', '', ''])
                row['CAP'] = cap_keys.get(key, ['', ''])
                aggregate[key] = row
            aggregate[key]['Examples'] = [row['Example %s' % i] for i in [1, 2, 3]
                                          if row.get('Example %s' % i)] + aggregate[key]['Examples']

    # write to CSV
    out = [[k, v['Count']] + v['Examples'][:3] + v['CAP'] + v['FLP'] + v['juris']
           for k, v in aggregate.iteritems() if v['Count'] >= 100]
    out.sort(key=lambda x: x[1], reverse=True)
    with open(os.path.join(dest_dir, 'aggregate.csv'), 'wb') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([
            'Series', 'Count',
            'Example 1', 'Example 2', 'Example 3',
            'CAP Cite', 'CAP Full',
            'FLP Type', 'FLP Name', 'FLP Cite',
            'Juris-M Type', 'Juris-M Cite', 'Juris-M Full',
        ])
        for row in out:
            csvwriter.writerow([unicode(s).encode("utf-8") for s in row])

import os
import re
import sys
import textwrap

from string import Template
from helpers import makedirs
from subprocess import check_output

if len(sys.argv) != 5:
    print("Usage: python webdoc.py <html_> <md_dir> <nb_dir> <index_dir>")
    # python webdoc.py ../../pynest/examples ~/000-md ~/000-html ~/000-nb
    sys.exit(1)

html_, md_dir, nb_dir, index_dir = sys.argv[1:]

makedirs(md_dir)
makedirs(nb_dir)

ipynbpath = '../../doc/model_details'
doc_dir = '../../doc'
img_dir = '../userdoc/img'


def examples_to_md(example, index_file, f_index):
    """Parse the examples."""
    if example:
        base = os.path.splitext(example)[0]
        the_name = os.path.basename(base)
        mdfile = ('{}/{}.md'.format(md_dir, the_name))
        """
        Tear the file into lines and let's see where the \'\'\' are.