def ngram_volume(volume_path):
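    # For each case XML file in this volume, tokenize the case text and tally
    # 1-, 2- and 3-gram counts keyed by the case's decision year (n-gram tokens
    # joined with tabs). One JSON file per n-gram length is then written to a
    # per-year directory that mirrors the volume path under dest_dir.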
    counts = defaultdict(  # year
        lambda: defaultdict(  # ngram_len
            lambda: defaultdict(  # ngram
                int  # count
            )))

    for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
        pq = parse_file(case_xml_path)
        tokens = tokenize_text(get_case_text(pq))
        history = []
        case_year = get_decision_date(pq).year
        for i, item in enumerate(tokens):
            history.append(item)
            for ngram_len in [1,2,3]:
                if len(history) >= ngram_len:
                    counts[case_year][ngram_len]["\t".join(history[-ngram_len:])] += 1
            if i >= 2:
                del history[0]

    for year, ngram_lens in counts.iteritems():
        out_dir = os.path.join(volume_path.replace(source_dir, dest_dir, 1), str(year))
        makedirs(out_dir)
        for ngram_len, data in ngram_lens.items():
            with open(os.path.join(out_dir, "%s.json" % ngram_len), 'w') as out:
                json.dump(data, out)
def search_volumes():
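    # For each reporter series directory under source_dir, scan every case XML
    # in every volume, find citations with the cite_match regex, tally how often
    # each cited series (the match's second capture group) appears, keep up to
    # three example citations, and write a per-series CSV sorted by count.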
    makedirs(dest_dir)
    for series_path in tqdm(sorted(glob(os.path.join(source_dir, "*/*")))):
        series_name = os.path.basename(series_path)
        known_series = defaultdict(
            lambda: {
                'count': 0, 'examples': []
            }
        )
        for volume_path in sorted(glob(os.path.join(series_path, "*"))):
            for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
                pq = parse_file(case_xml_path)
                text = get_case_text(pq)
                cites = cite_match.findall(text)
                for series in cites:
                    ks = known_series[series[1]]
                    ks['count'] += 1
                    if len(ks['examples']) < 3:
                        ks['examples'].append(" ".join(series))

        # write to CSV
        out = [[k, v['count']]+v['examples'] for k, v in known_series.iteritems()]
        out.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(dest_dir, '%s.csv' % series_name), 'wb') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['Series', 'Count', 'Example 1', 'Example 2', 'Example 3'])
            for row in out:
                csvwriter.writerow(row)
Example #3
def search_volumes():
    makedirs(dest_dir)
    for series_path in tqdm(sorted(glob(os.path.join(source_dir, "*/*")))):
        series_name = os.path.basename(series_path)
        known_series = defaultdict(lambda: {'count': 0, 'examples': []})
        for volume_path in sorted(glob(os.path.join(series_path, "*"))):
            for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
                pq = parse_file(case_xml_path)
                text = get_case_text(pq)
                cites = cite_match.findall(text)
                for series in cites:
                    ks = known_series[series[1]]
                    ks['count'] += 1
                    if len(ks['examples']) < 3:
                        ks['examples'].append(" ".join(series))

        # write to CSV
        out = [[k, v['count']] + v['examples']
               for k, v in known_series.iteritems()]
        out.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(dest_dir, '%s.csv' % series_name),
                  'wb') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(
                ['Series', 'Count', 'Example 1', 'Example 2', 'Example 3'])
            for row in out:
                csvwriter.writerow(row)
def ngram_volume(volume_path):
    counts = defaultdict(  # year
        lambda: defaultdict(  # ngram_len
            lambda: defaultdict(  # ngram
                int  # count
            )))

    for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
        pq = parse_file(case_xml_path)
        tokens = tokenize_text(get_case_text(pq))
        history = []
        case_year = get_decision_date(pq).year
        for i, item in enumerate(tokens):
            history.append(item)
            for ngram_len in [1, 2, 3]:
                if len(history) >= ngram_len:
                    counts[case_year][ngram_len]["\t".join(
                        history[-ngram_len:])] += 1
            if i >= 2:
                del history[0]

    for year, ngram_lens in counts.iteritems():
        out_dir = os.path.join(volume_path.replace(source_dir, dest_dir, 1),
                               str(year))
        makedirs(out_dir)
        for ngram_len, data in ngram_lens.items():
            with open(os.path.join(out_dir, "%s.json" % ngram_len),
                      'w') as out:
                json.dump(data, out)
def wordcloud_all_jurisdictions():
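    # For each jurisdiction directory (currently just "Illinois"): render a word
    # cloud of the overall unigram totals, compute each word's average rank
    # across the per-year top-word lists (skipping years with fewer than 10,000
    # word occurrences), then score each year's words by how far they deviate
    # from that average and write per-year word clouds and rank CSVs.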

    for jurisdiction_path in glob(os.path.join(source_dir, "Illinois")):
        print "Processing", jurisdiction_path
        out_dir = jurisdiction_path.replace(source_dir, dest_dir, 1)
        top_words_out_dir = jurisdiction_path.replace(source_dir, top_words_dir, 1)
        makedirs(out_dir)
        makedirs(top_words_out_dir)

        # write global
        global_freqs = process_word_dict(json.load(open(os.path.join(jurisdiction_path, 'totals/1.json'))))
        save_wordcloud(global_freqs, os.path.join(out_dir, 'totals.png'))

        # load year data
        bare_top_words_by_year = {}
        for year_path in tqdm(glob(os.path.join(jurisdiction_path, "*"))):
            year = os.path.basename(year_path)
            if year == 'totals':
                continue

            year_freqs = process_word_dict(json.load(open(os.path.join(year_path, '1.json'))))

            # skip years with few cases (probably typos)
            if sum(w[1] for w in year_freqs)<10000:
                continue

            bare_top_words_by_year[year] = [w[0] for w in year_freqs]

        # calculate global rankings
        word_to_ranking = defaultdict(lambda: 0)
        for year, words in tqdm(bare_top_words_by_year.iteritems()):
            for pos, word in enumerate(words):
                word_to_ranking[word] += (1000 - pos)
        average_rank = dict([(word, ranking/len(bare_top_words_by_year)) for word, ranking in word_to_ranking.iteritems()])

        # write average ranks CSV
        save_top_list(['word','average_rank'],
                      [[word, 1000-rank] for word, rank in average_rank.iteritems()],
                      os.path.join(top_words_out_dir, 'average_ranks.csv'))

        # calculate scores by year
        year_to_volatile_words = {}
        for year, words in tqdm(bare_top_words_by_year.iteritems()):
            word_to_ranking_delta = {}
            for pos, word in enumerate(words):
                rank_for_year = 1000 - pos
                word_to_ranking_delta[word] = (rank_for_year, average_rank[word], rank_for_year - average_rank[word])
                year_to_volatile_words[year] = sorted(word_to_ranking_delta.items(), key=lambda x: x[1][2], reverse=True)

        # wordclouds
        for year, words in tqdm(year_to_volatile_words.iteritems()):
            freqs = [(w[0], w[1][2]) for w in words[:200]]
            save_wordcloud(freqs, os.path.join(out_dir, '%s.png' % year))

            # write year ranks CSV
            save_top_list(['word', 'absolute_rank', 'relative_score'],
                          [(w[0], 1000-w[1][0], w[1][2]) for w in words],
                          os.path.join(top_words_out_dir, '%s_ranks.csv' % year))
def bam_sort(bam_filename, sorted_bam_filename, tempdir, mem="2G", **kwargs):
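    # Coordinate-sort a BAM file with Picard SortSam, using the given temp
    # directory and JVM heap size; extra keyword arguments are passed through
    # to pypeliner's command-line executor.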
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem, '-XX:ParallelGCThreads=1',
        'SortSam', 'INPUT=' + bam_filename, 'OUTPUT=' + sorted_bam_filename,
        'SORT_ORDER=coordinate', 'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir, 'MAX_RECORDS_IN_RAM=150000', **kwargs)
def tokenize_case(case_xml_path):
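    # Tokenize the text of one case XML file with NLTK and write the tokens,
    # one per line, to a mirrored .txt path under dest_dir; cases that have
    # already been tokenized are skipped.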
    out_path = case_xml_path.replace(source_dir, dest_dir, 1).replace('.xml', '.txt')
    if os.path.exists(out_path):
        return
    pq = parse_file(case_xml_path)
    case_text = get_case_text(pq)
    tokens = nltk.word_tokenize(case_text)
    makedirs(os.path.dirname(out_path))
    with open(out_path, 'w') as out:
        out.write(u"\n".join(tokens).encode("utf8"))
def search_front_matter():
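    # For each series, walk its volumes in numeric order until a volume decided
    # after 1923 is reached, and collect the ALTO OCR pages that precede the
    # first case of each volume. Pages are kept only if they mention "reporter"
    # and look like running prose (many lowercase words, few all-caps words,
    # few periods); matches are written to one text file per series, and any
    # error causes the series to be skipped.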
    for jurisdiction_path in sorted(glob(os.path.join(source_dir, "*"))):
        makedirs(jurisdiction_path.replace(source_dir, dest_dir))
        for series_path in glob(os.path.join(jurisdiction_path, "*")):
            print series_path
            try:
                out = u""
                for volume_path in sorted(glob(os.path.join(series_path, "*")), key=lambda x: int(x.rsplit('/',1)[1])):

                    # load first case in volume
                    case_paths = sorted(glob(os.path.join(volume_path, "*.xml")))
                    if not case_paths:
                        continue
                    first_case_path = case_paths[0]
                    pq = parse_file(first_case_path)

                    # stop processing volume after 1923
                    year = get_decision_date(pq).year
                    if year > 1923:
                        break

                    # get first alto file for first case
                    first_case_alto_file = pq('METS|fileGrp[USE="alto"] METS|FLocat')[0].attrib[qn("xlink|href")][3:]
                    first_case_alto_name = os.path.basename(first_case_alto_file)

                    # get directory for alto files for volume
                    case_id = pq("case|case").attr('caseid')
                    alto_dir = os.path.dirname(os.path.join(raw_source_dir, case_id, first_case_alto_file)).replace('_0001', '_redacted')

                    # process alto files until we hit the one for the first case in the volume
                    for alto_path in sorted(glob(os.path.join(alto_dir, "*"))):
                        if alto_path.endswith(first_case_alto_name):
                            break

                        # only bother parsing XML if we find 'reporter' in the text of the alto file somewhere
                        alto_data = open(alto_path).read()
                        if 'reporter' not in alto_data.lower():
                            continue
                        alto_pq = PyQuery(alto_data, parser='xml', namespaces=namespaces)

                        # extract OCR'd text from alto XML
                        alto_text = " ".join(x.attrib["CONTENT"] for x in alto_pq('alto|String'))

                        # if page has more than fifty lowercase words, less than 15 uppercase words (usually a list of judges),
                        # and less than 30 periods (usually a table of contents), print citation and page text
                        if len(re.findall(r'\b[a-z]+\b', alto_text))>50 and len(re.findall(r'\b[A-Z][A-Z]+\b', alto_text))<15 and len(re.findall(r'\.', alto_text))<30:
                            volume_cite = pq('case|citation[category="official"]').text().rsplit(" ",1)[0]
                            out += "%s\n%s\n%s\n\n" % (alto_path, volume_cite, alto_text)

                # write out all matched pages for series
                if out:
                    open((series_path.replace(source_dir, dest_dir) + ".txt").replace('..', '.'), "w").write(out.encode('utf8'))

            except Exception as e:
                print "Skipping -- %s" % e
Example #9
def sort_case(case_xml_path):
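    # Split the case's citation into volume, reporter and page number, then
    # hard-link the XML file into a jurisdiction/reporter/volume directory tree,
    # replacing any existing link.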
    pq = parse_file(case_xml_path)
    jurisdiction = get_jurisdiction(pq)
    citation = get_citation(pq)
    cite_parts = citation.split(" ")
    volume, reporter, page_number = cite_parts[0], " ".join(cite_parts[1:-1]), cite_parts[-1]
    volume_dir = os.path.join(jurisdiction, reporter, volume)
    makedirs(volume_dir)
    dest_path = os.path.join(volume_dir, os.path.basename(case_xml_path))
    if os.path.exists(dest_path):
        os.remove(dest_path)
    os.link(case_xml_path, dest_path)
Example #10
def sort_case(case_xml_path):
    pq = parse_file(case_xml_path)
    jurisdiction = get_jurisdiction(pq)
    citation = get_citation(pq)
    cite_parts = citation.split(" ")
    volume, reporter, page_number = cite_parts[0], " ".join(
        cite_parts[1:-1]), cite_parts[-1]
    volume_dir = os.path.join(jurisdiction, reporter, volume)
    makedirs(volume_dir)
    dest_path = os.path.join(volume_dir, os.path.basename(case_xml_path))
    if os.path.exists(dest_path):
        os.remove(dest_path)
    os.link(case_xml_path, dest_path)
def bam_markdups(bam_filename,
                 markduped_bam_filename,
                 metrics_filename,
                 tempdir,
                 mem="2G",
                 **kwargs):
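    # Flag (but do not remove) duplicate reads in a coordinate-sorted BAM with
    # Picard MarkDuplicates, writing the marked BAM and a duplication-metrics file.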
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem, '-XX:ParallelGCThreads=1',
        'MarkDuplicates', 'INPUT=' + bam_filename,
        'OUTPUT=' + markduped_bam_filename, 'METRICS_FILE=' + metrics_filename,
        'REMOVE_DUPLICATES=False', 'ASSUME_SORTED=True',
        'VALIDATION_STRINGENCY=LENIENT', 'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000', **kwargs)
Example #12
def merge_limerick_lines():
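    # Merge the per-file JSON line collections under source_dir into one nested
    # dict keyed by line type, emphasis pattern, last syllable and lower-cased
    # last token, drop emphasis patterns and rhyme groups that are too short or
    # offer too few ending words, and dump the result to limerick_lines.json.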
    merged = defaultdict(  # line_type
        lambda: defaultdict(  # emphasis_pattern
            lambda: defaultdict(  # last_syllable
                lambda: defaultdict(  # last_token
                    list  # sentence
                ))))

    for path in tqdm(glob(os.path.join(source_dir, "*"))):
        line_types = json.load(open(path))
        for line_type, emphasis_patterns in line_types.iteritems():
            for emphasis_pattern, last_syllables in emphasis_patterns.iteritems(
            ):
                for last_syllable, last_tokens in last_syllables.iteritems():
                    for last_token, lines in last_tokens.iteritems():
                        merged[line_type][emphasis_pattern][last_syllable][
                            last_token.lower()].extend(lines)

    filtered = defaultdict(  # line_type
        lambda: defaultdict(  # emphasis_pattern
            lambda: defaultdict(  # last_syllable
dict)))

    for line_type, emphasis_patterns in merged.iteritems():
        for emphasis_pattern, last_syllables in emphasis_patterns.iteritems():

            # skip long lines that don't include '1**1**1'
            if line_type == 'long' and '1**1**1' not in emphasis_pattern:  # len(emphasis_pattern)<5:
                continue

            # skip short lines that are too short
            if line_type == 'short' and len(emphasis_pattern) < 4:
                continue

            for last_syllable, last_tokens in last_syllables.iteritems():

                # skip groups with insufficient options
                if (line_type == 'long' and len(last_tokens) < 3) or (
                        line_type == 'short' and len(last_tokens) < 2):
                    continue

                filtered[line_type][emphasis_pattern][
                    last_syllable] = last_tokens

    makedirs(dest_dir)
    json.dump(filtered, open(os.path.join(dest_dir, 'limerick_lines.json'),
                             'w'))
def bam_collect_gc_metrics(bam_filename,
                           ref_genome,
                           metrics_filename,
                           summary_filename,
                           chart_filename,
                           tempdir,
                           mem="2G",
                           **kwargs):
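    # Run Picard CollectGcBiasMetrics on a BAM file against the reference
    # genome, writing the metrics, summary and chart outputs.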
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem, '-XX:ParallelGCThreads=1',
        'CollectGcBiasMetrics', 'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename, 'REFERENCE_SEQUENCE=' + ref_genome,
        'S=' + summary_filename, 'CHART_OUTPUT=' + chart_filename,
        'VALIDATION_STRINGENCY=LENIENT', 'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000', **kwargs)
Example #14
def sort_all_volumes():
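    # Sort each vendor volume into a temp tree, skipping volumes superseded by
    # a later "_redacted_" version of the same base name, then swap the temp
    # tree into place of dest_dir.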
    # make everything in a temp dir
    makedirs(tmp_dest_dir)
    os.chdir(tmp_dest_dir)

    dirs = sorted(glob(os.path.join(source_dir, "from_vendor/*")))
    for i, volume_path in enumerate(tqdm(dirs)):

        # skip dirs that are superseded by the following version
        base_name = volume_path.split('_redacted_',1)[0]
        if i < len(dirs)-1 and dirs[i+1].startswith(base_name):
            #print "Skipping", volume_path
            continue

        sort_volume(volume_path)

    # swap temp dir in place of existing dir
    shutil.rmtree(dest_dir)
    os.rename(tmp_dest_dir, dest_dir)
Example #15
def sort_all_volumes():
    # make everything in a temp dir
    makedirs(tmp_dest_dir)
    os.chdir(tmp_dest_dir)

    dirs = sorted(glob(os.path.join(source_dir, "from_vendor/*")))
    for i, volume_path in enumerate(tqdm(dirs)):

        # skip dirs that are superseded by the following version
        base_name = volume_path.split('_redacted_', 1)[0]
        if i < len(dirs) - 1 and dirs[i + 1].startswith(base_name):
            #print "Skipping", volume_path
            continue

        sort_volume(volume_path)

    # swap temp dir in place of existing dir
    shutil.rmtree(dest_dir)
    os.rename(tmp_dest_dir, dest_dir)
def merge_limerick_lines():
    merged = defaultdict(  # line_type
        lambda: defaultdict(  # emphasis_pattern
            lambda: defaultdict(  # last_syllable
                lambda: defaultdict(  # last_token
                    list  # sentence
                ))))

    for path in tqdm(glob(os.path.join(source_dir, "*"))):
        line_types = json.load(open(path))
        for line_type, emphasis_patterns in line_types.iteritems():
            for emphasis_pattern, last_syllables in emphasis_patterns.iteritems():
                for last_syllable, last_tokens in last_syllables.iteritems():
                    for last_token, lines in last_tokens.iteritems():
                        merged[line_type][emphasis_pattern][last_syllable][last_token.lower()].extend(lines)

    filtered = defaultdict(  # line_type
        lambda: defaultdict(  # emphasis_pattern
            lambda: defaultdict(  # last_syllable
dict)))

    for line_type, emphasis_patterns in merged.iteritems():
        for emphasis_pattern, last_syllables in emphasis_patterns.iteritems():

            # skip long lines that don't include '1**1**1'
            if line_type=='long' and '1**1**1' not in emphasis_pattern:  # len(emphasis_pattern)<5:
                continue

            # skip short lines that are too short
            if line_type=='short' and len(emphasis_pattern)<4:
                continue

            for last_syllable, last_tokens in last_syllables.iteritems():

                # skip groups with insufficient options
                if (line_type=='long' and len(last_tokens)<3) or (line_type=='short' and len(last_tokens)<2):
                    continue

                filtered[line_type][emphasis_pattern][last_syllable] = last_tokens

    makedirs(dest_dir)
    json.dump(filtered, open(os.path.join(dest_dir, 'limerick_lines.json'), 'w'))
def bam_collect_wgs_metrics(bam_filename,
                            ref_genome,
                            metrics_filename,
                            config,
                            tempdir,
                            mem="2G",
                            **kwargs):
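    # Run Picard CollectWgsMetrics on a BAM against the reference genome, with
    # base/mapping-quality thresholds and the unpaired-read setting taken from
    # the picard_wgs_params section of the config dict.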
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem, '-XX:ParallelGCThreads=1',
        'CollectWgsMetrics', 'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename, 'REFERENCE_SEQUENCE=' + ref_genome,
        'MINIMUM_BASE_QUALITY=' +
        str(config['picard_wgs_params']['min_bqual']),
        'MINIMUM_MAPPING_QUALITY=' +
        str(config['picard_wgs_params']['min_mqual']), 'COVERAGE_CAP=500',
        'VALIDATION_STRINGENCY=LENIENT', 'COUNT_UNPAIRED=' +
        ('True' if config['picard_wgs_params']['count_unpaired'] else 'False'),
        'TMP_DIR=' + tempdir, 'MAX_RECORDS_IN_RAM=150000', **kwargs)
def bam_collect_insert_metrics(bam_filename,
                               flagstat_metrics_filename,
                               metrics_filename,
                               histogram_filename,
                               tempdir,
                               mem="2G",
                               **kwargs):
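    # Inspect the flagstat metrics file (presumably samtools flagstat output)
    # for properly paired reads; if there are none, write a placeholder FAILED
    # metrics file and an empty histogram, otherwise run Picard
    # CollectInsertSizeMetrics.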
    # Check if any paired reads exist
    has_paired = None
    with open(flagstat_metrics_filename) as f:
        for line in f:
            if 'properly paired' in line:
                if line.startswith('0 '):
                    has_paired = False
                else:
                    has_paired = True

    if has_paired is None:
        raise Exception(
            'Unable to determine number of properly paired reads from {}'.
            format(flagstat_metrics_filename))

    if not has_paired:
        with open(metrics_filename, 'w') as f:
            f.write('## FAILED: No properly paired reads\n')
        with open(histogram_filename, 'w'):
            pass
        return

    if not os.path.exists(tempdir):
        makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem, '-XX:ParallelGCThreads=1',
        'CollectInsertSizeMetrics', 'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename, 'HISTOGRAM_FILE=' + histogram_filename,
        'ASSUME_SORTED=True', 'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir, 'MAX_RECORDS_IN_RAM=150000', **kwargs)
def produce_fastqc_report(fastq_filename, output_html, output_plots, temp_dir,
                          **kwargs):
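    # Run FastQC on a FASTQ file into temp_dir, then move the generated
    # _fastqc.zip and _fastqc.html outputs to the requested plot and HTML
    # paths; unrecognised input extensions raise an exception.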
    makedirs(temp_dir)

    pypeliner.commandline.execute('fastqc', '--outdir=' + temp_dir,
                                  fastq_filename, **kwargs)

    fastq_basename = os.path.basename(fastq_filename)
    if fastq_basename.endswith(".fastq.gz"):
        fastq_basename = fastq_basename.replace(".fastq.gz", "")
    elif fastq_basename.endswith(".fq.gz"):
        fastq_basename = fastq_basename.replace(".fq.gz", "")
    elif fastq_basename.endswith(".fq"):
        fastq_basename = fastq_basename.replace(".fq", "")
    elif fastq_basename.endswith(".fastq"):
        fastq_basename = fastq_basename.replace(".fastq", "")
    else:
        raise Exception("Unknown file type")

    output_basename = os.path.join(temp_dir, fastq_basename)

    shutil.move(output_basename + '_fastqc.zip', output_plots)
    shutil.move(output_basename + '_fastqc.html', output_html)
def save_counts(counts, jurisdiction_path, subdir):
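    # Write one JSON file per n-gram length into a subdirectory of the
    # jurisdiction's mirrored output directory under dest_dir.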
    out_dir = os.path.join(jurisdiction_path.replace(source_dir, dest_dir, 1), str(subdir))
    makedirs(out_dir)
    for ngram_length, data in counts.iteritems():
        with open(os.path.join(out_dir, "%s.json" % ngram_length), 'w') as out:
            json.dump(data, out)
        "Test File name = {}, Preprocessed Test input size {}, Test Input size before preprocessing = {}"
        .format(testfile, test_input.size(), mean_norm_data_df.shape[0]))

    #HYPERPARAMETERS 2.
    NUM_INSTANCES = data_df.shape[0]
    BATCHSIZE = int(args.batchsize)
    BATCHSIZE = min(NUM_INSTANCES, BATCHSIZE)
    NUM_BATCHES = int(NUM_INSTANCES / BATCHSIZE)
    root.info("Results output dir path: {}".format(RESULTS_OUTPUT_DIR_NAME))
    root.info("Sequence Length = {}, TOTAL NUMBER OF INSTANCES = {}".format(
        ENCODER_SEQUENCE_LENGTH, NUM_INSTANCES))

    #Check whether figure directories exist. If not create them.
    TESTOUTPUTDIR = RESULTS_OUTPUT_DIR_NAME + "results_test/" + os.path.basename(
        testfile).split(".csv")[0]
    makedirs(TESTOUTPUTDIR)
    TESTOUTPUTFILE = TESTOUTPUTDIR + "/" + "test_mse.txt"

    root.info("Results output dir path: {}".format(RESULTS_OUTPUT_DIR_NAME))
    root.info("Sequence Length = {}, TOTAL NUMBER OF INSTANCES = {}".format(
        ENCODER_SEQUENCE_LENGTH, NUM_INSTANCES))

    #Instantiate Attention Encoder
    encoder = EncoderHierAttn(INPUT_SIZE, HIDDEN_SIZE, ENCODER_SEQUENCE_LENGTH,
                              NUM_LAYERS, args.rnnobject, root)
    encoder.load_state_dict(torch.load(MODEL_OUTPUT_DIR + "encoder.pt"))
    encoder.eval()
    encoder.cuda()  ##CUDA

    #Instantiate Attention Decoder
    decoder = DecoderHierAttn(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, OUTPUT_SIZE,
Example #22
def make_output_dir(key):
    output_dir = os.path.join(dest_dir, key)
    makedirs(output_dir)
    return output_dir
def write_lines():
    out_path = jurisdiction_path.replace(source_dir, dest_dir, 1).rstrip('/') + '.json'
    makedirs(os.path.dirname(out_path))
    with open(out_path, 'w') as out:
        json.dump(lines, out)
BATCH_SIZE = int(args.batchsize)
DROPOUT = float(args.dropout)
SLIDING_ATTENTION = json.loads(args.slidingattention.lower())
HIERARCHICAL_ATTN_METHOD = args.hierattnmethod
SLIDING_WINSIZE = int(args.slidingwindowsize)

# CREATE
RESULTS_OUTPUT_DIR_NAME = args.predictionsoutputdir + "/" + "SEQUENCE_LENGTH_{}_NUMLAYERS_{}_HIDDEN_SIZE_{}_DROPOUT_{}_TEACHERFORCING_{}_RNNOBJECT_{}_ITERNUM_{}".format(
    ENCODER_SEQUENCE_LENGTH, NUM_LAYERS, HIDDEN_SIZE, DROPOUT,
    args.teacherforcing, args.rnnobject, args.iternum) + "/"
print("Results Output Dir Name = {}".format(RESULTS_OUTPUT_DIR_NAME))
MODEL_OUTPUT_DIR = RESULTS_OUTPUT_DIR_NAME + "model/"
VALIDATION_OUTPUT_FILE = RESULTS_OUTPUT_DIR_NAME + "validation_mse.txt"
#Create figures train and validation directories.
print("Model_output_dir = {}".format(MODEL_OUTPUT_DIR))
makedirs(MODEL_OUTPUT_DIR)
makedirs(RESULTS_OUTPUT_DIR_NAME + "figures_train")
makedirs(RESULTS_OUTPUT_DIR_NAME + "figures_validation")

root.info("Results output dir path: {}".format(RESULTS_OUTPUT_DIR_NAME))
"""
PREPROCESS TIMESERIES DATA. 
 If you want to run the model on a new dataset and select a sub-set of columns, add a condition to the following set of conditional statements 

 `elif "newdatasetsubstring" in args.datasetname:  
    ...Do Something...`

"""
if "tep" in args.datasetname:
    #columnsofinterest=['MEAS_A_Feed', 'MEAS_D_Feed', 'MEAS_E_Feed', 'MEAS_A_C Feed','MEAS_Recycle_flow', 'MEAS_Reactor_feed', 'MEAS_Reactor_pressure','MEAS_Reactor_level', 'MEAS_Reactor_temperature', 'MEAS_Purge_rate']
    columnsofinterest = [
Example #25
def wordcloud_all_jurisdictions():

    for jurisdiction_path in glob(os.path.join(source_dir, "Illinois")):
        print "Processing", jurisdiction_path
        out_dir = jurisdiction_path.replace(source_dir, dest_dir, 1)
        top_words_out_dir = jurisdiction_path.replace(source_dir,
                                                      top_words_dir, 1)
        makedirs(out_dir)
        makedirs(top_words_out_dir)

        # write global
        global_freqs = process_word_dict(
            json.load(open(os.path.join(jurisdiction_path, 'totals/1.json'))))
        save_wordcloud(global_freqs, os.path.join(out_dir, 'totals.png'))

        # load year data
        bare_top_words_by_year = {}
        for year_path in tqdm(glob(os.path.join(jurisdiction_path, "*"))):
            year = os.path.basename(year_path)
            if year == 'totals':
                continue

            year_freqs = process_word_dict(
                json.load(open(os.path.join(year_path, '1.json'))))

            # skip years with few cases (probably typos)
            if sum(w[1] for w in year_freqs) < 10000:
                continue

            bare_top_words_by_year[year] = [w[0] for w in year_freqs]

        # calculate global rankings
        word_to_ranking = defaultdict(lambda: 0)
        for year, words in tqdm(bare_top_words_by_year.iteritems()):
            for pos, word in enumerate(words):
                word_to_ranking[word] += (1000 - pos)
        average_rank = dict([(word, ranking / len(bare_top_words_by_year))
                             for word, ranking in word_to_ranking.iteritems()])

        # write average ranks CSV
        save_top_list(['word', 'average_rank'],
                      [[word, 1000 - rank]
                       for word, rank in average_rank.iteritems()],
                      os.path.join(top_words_out_dir, 'average_ranks.csv'))

        # calculate scores by year
        year_to_volatile_words = {}
        for year, words in tqdm(bare_top_words_by_year.iteritems()):
            word_to_ranking_delta = {}
            for pos, word in enumerate(words):
                rank_for_year = 1000 - pos
                word_to_ranking_delta[word] = (rank_for_year,
                                               average_rank[word],
                                               rank_for_year -
                                               average_rank[word])
                year_to_volatile_words[year] = sorted(
                    word_to_ranking_delta.items(),
                    key=lambda x: x[1][2],
                    reverse=True)

        # wordclouds
        for year, words in tqdm(year_to_volatile_words.iteritems()):
            freqs = [(w[0], w[1][2]) for w in words[:200]]
            save_wordcloud(freqs, os.path.join(out_dir, '%s.png' % year))

            # write year ranks CSV
            save_top_list(['word', 'absolute_rank', 'relative_score'],
                          [(w[0], 1000 - w[1][0], w[1][2]) for w in words],
                          os.path.join(top_words_out_dir,
                                       '%s_ranks.csv' % year))
def aggregate_reporters():
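    # Merge the per-series citation CSVs into a single aggregate.csv: normalise
    # each series string to a key, sum counts, keep up to three example
    # citations, and annotate each key with matching entries from the FLP
    # reporters data (presumably Free Law Project), the Juris-M abbreviation
    # lists and the CAP reporter list. Only series cited at least 100 times
    # are written, sorted by count.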
    makedirs(dest_dir)
    aggregate = {}

    # get map of reporter key to canonical name in FLP db
    flp_keys = {}
    for reporter_list in REPORTERS.itervalues():
        for reporter in reporter_list:
            fields = [reporter['cite_type'], reporter['name']]
            for k in reporter["editions"].keys():
                flp_keys[cite_to_key(k)] = fields + [k]
            for k, v in reporter["variations"].items():
                flp_keys[cite_to_key(k)] = fields + [v]

    # get map of reporter key to name in Juris-M db
    juris_keys = {}
    for json_file, label in [[
            '../lib/jurism-abbreviations/primary-us.json', 'primary'
    ], ['../lib/jurism-abbreviations/secondary-us-bluebook.json',
            'secondary']]:
        data = json.load(
            open(os.path.join(os.path.dirname(__file__), json_file)))
        for juris in data["xdata"].itervalues():
            for full_name, short_name in juris["container-title"].iteritems():
                key = cite_to_key(short_name)
                if key not in juris_keys:
                    juris_keys[key] = [label, short_name, full_name]

    # get map of reporter key to CAP reporter
    cap_keys = {}
    for reporter in json.load(
            open(
                os.path.join(os.path.dirname(__file__),
                             '../lib/reporter-list/reporters.json'))):
        key = cite_to_key(reporter['short'])
        if key not in cap_keys:
            cap_keys[key] = [reporter['reporter'], reporter['short']]

    # aggregate rows in our collected citations
    for csv_path in tqdm(sorted(glob(os.path.join(source_dir, "*.csv")))):
        csvreader = csv.DictReader(open(csv_path))
        for row in csvreader:
            key = cite_to_key(row['Series'])
            if key in aggregate:
                aggregate[key]['Count'] += int(row['Count'])
            else:
                row['Examples'] = ['', '', '']
                row['Count'] = int(row['Count'])
                row['Series'] = key
                row['FLP'] = flp_keys.get(key, ['', '', ''])
                row['juris'] = juris_keys.get(key, ['', '', ''])
                row['CAP'] = cap_keys.get(key, ['', ''])

                aggregate[key] = row

            aggregate[key]['Examples'] = [
                row['Example %s' % i]
                for i in [1, 2, 3] if row.get('Example %s' % i)
            ] + aggregate[key]['Examples']

    # write to CSV
    out = [[k, v['Count']] + v['Examples'][:3] + v['CAP'] + v['FLP'] +
           v['juris'] for k, v in aggregate.iteritems() if v['Count'] >= 100]
    out.sort(key=lambda x: x[1], reverse=True)
    with open(os.path.join(dest_dir, 'aggregate.csv'), 'wb') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([
            'Series',
            'Count',
            'Example 1',
            'Example 2',
            'Example 3',
            'CAP Cite',
            'CAP Full',
            'FLP Type',
            'FLP Name',
            'FLP Cite',
            'Juris-M Type',
            'Juris-M Cite',
            'Juris-M Full',
        ])
        for row in out:
            csvwriter.writerow([unicode(s).encode("utf-8") for s in row])
Example #27
def make_output_dir(key):
    output_dir = os.path.join(dest_dir, key)
    makedirs(output_dir)
    return output_dir
Example #28
import os
import re
import sys
import textwrap
from string import Template
from helpers import makedirs
from subprocess import check_output

if len(sys.argv) != 5:
    print("Usage: python webdoc.py <html_> <md_dir> <nb_dir> <index_dir>")
    # python webdoc.py ../../pynest/examples ~/000-md ~/000-html ~/000-nb
    sys.exit(1)

html_, md_dir, nb_dir, index_dir = sys.argv[1:]


makedirs(md_dir)
makedirs(nb_dir)

ipynbpath = '../../doc/model_details'
doc_dir = '../../doc'
img_dir = '../userdoc/img'


def examples_to_md(example, index_file, f_index):
    """Parse the examples."""
    if example:
        base = os.path.splitext(example)[0]
        the_name = os.path.basename(base)
        mdfile = ('{}/{}.md'.format(md_dir, the_name))
        """
        Tear the file into lines and let's see where the \'\'\' are.