Example 1
def poll():
    pop_messages = pop_feed(
        settings.POLL_USERNAME, 
        settings.POLL_PASSWORD, 
        settings.POLL_HOST
    ) 
    #pop_messages = disk_feed() 
    messages = grab_messages(pop_messages)
    for message in messages:
        try:
            print "Processing messing"
            for (filename, original_filename) in message["files"]:
                parse_file(filename)
        except Exception as e:
            print e
Example 2
def ngram_volume(volume_path):
    counts = defaultdict(  # year
        lambda: defaultdict(  # ngram_len
            lambda: defaultdict(  # ngram
                int  # count
            )))

    for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
        pq = parse_file(case_xml_path)
        tokens = tokenize_text(get_case_text(pq))
        history = []
        case_year = get_decision_date(pq).year
        for i, item in enumerate(tokens):
            history.append(item)
            for ngram_len in [1,2,3]:
                if len(history) >= ngram_len:
                    counts[case_year][ngram_len]["\t".join(history[-ngram_len:])] += 1
            if i >= 2:
                del history[0]

    for year, ngram_lens in counts.iteritems():
        out_dir = os.path.join(volume_path.replace(source_dir, dest_dir, 1), str(year))
        makedirs(out_dir)
        for ngram_len, data in ngram_lens.items():
            with open(os.path.join(out_dir, "%s.json" % ngram_len), 'w') as out:
                json.dump(data, out)
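
The inner loop above keeps a rolling history of at most three tokens and, at every position, counts the 1-, 2- and 3-grams that end there. A small self-contained illustration of the same windowing logic on a toy token list (independent of the case-XML helpers):

from collections import defaultdict

tokens = ["the", "court", "held", "the", "court"]
counts = defaultdict(int)
history = []
for i, token in enumerate(tokens):
    history.append(token)
    for ngram_len in [1, 2, 3]:
        if len(history) >= ngram_len:
            counts["\t".join(history[-ngram_len:])] += 1
    if i >= 2:
        del history[0]  # keep at most 3 tokens of history

# e.g. counts["the\tcourt"] == 2, counts["the\tcourt\theld"] == 1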
Example 3
def search_volumes():
    makedirs(dest_dir)
    for series_path in tqdm(sorted(glob(os.path.join(source_dir, "*/*")))):
        series_name = os.path.basename(series_path)
        known_series = defaultdict(
            lambda: {
                'count': 0, 'examples': []
            }
        )
        for volume_path in sorted(glob(os.path.join(series_path, "*"))):
            for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
                pq = parse_file(case_xml_path)
                text = get_case_text(pq)
                cites = cite_match.findall(text)
                for series in cites:
                    ks = known_series[series[1]]
                    ks['count'] += 1
                    if len(ks['examples']) < 3:
                        ks['examples'].append(" ".join(series))

        # write to CSV
        out = [[k, v['count']]+v['examples'] for k, v in known_series.iteritems()]
        out.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(dest_dir, '%s.csv' % series_name), 'wb') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['Series', 'Count', 'Example 1', 'Example 2', 'Example 3'])
            for row in out:
                csvwriter.writerow(row)
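
The cite_match regex is defined elsewhere in the project; from the way it is used here (findall() returning tuples, index 1 treated as the reporter name, and the joined groups treated as a full citation), it presumably captures volume, reporter and page groups. A hypothetical pattern along those lines, purely as an illustration:

import re

# Hypothetical -- the real pattern lives elsewhere in the project.
# Groups: (volume, reporter, page), e.g. "123 Mass. 456" -> ("123", "Mass.", "456")
cite_match = re.compile(r"\b(\d+) ([A-Z][A-Za-z0-9.' ]{0,30}?) (\d+)\b")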
Example 4
def ngram_volume(volume_path):
    counts = defaultdict(  # year
        lambda: defaultdict(  # ngram_len
            lambda: defaultdict(  # ngram
                int  # count
            )))

    for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
        pq = parse_file(case_xml_path)
        tokens = tokenize_text(get_case_text(pq))
        history = []
        case_year = get_decision_date(pq).year
        for i, item in enumerate(tokens):
            history.append(item)
            for ngram_len in [1, 2, 3]:
                if len(history) >= ngram_len:
                    counts[case_year][ngram_len]["\t".join(
                        history[-ngram_len:])] += 1
            if i >= 2:
                del history[0]

    for year, ngram_lens in counts.iteritems():
        out_dir = os.path.join(volume_path.replace(source_dir, dest_dir, 1),
                               str(year))
        makedirs(out_dir)
        for ngram_len, data in ngram_lens.items():
            with open(os.path.join(out_dir, "%s.json" % ngram_len),
                      'w') as out:
                json.dump(data, out)
Example 5
def search_volumes():
    makedirs(dest_dir)
    for series_path in tqdm(sorted(glob(os.path.join(source_dir, "*/*")))):
        series_name = os.path.basename(series_path)
        known_series = defaultdict(lambda: {'count': 0, 'examples': []})
        for volume_path in sorted(glob(os.path.join(series_path, "*"))):
            for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
                pq = parse_file(case_xml_path)
                text = get_case_text(pq)
                cites = cite_match.findall(text)
                for series in cites:
                    ks = known_series[series[1]]
                    ks['count'] += 1
                    if len(ks['examples']) < 3:
                        ks['examples'].append(" ".join(series))

        # write to CSV
        out = [[k, v['count']] + v['examples']
               for k, v in known_series.iteritems()]
        out.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(dest_dir, '%s.csv' % series_name),
                  'wb') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(
                ['Series', 'Count', 'Example 1', 'Example 2', 'Example 3'])
            for row in out:
                csvwriter.writerow(row)
Example 6
def tokenize_case(case_xml_path):
    out_path = case_xml_path.replace(source_dir, dest_dir, 1).replace('.xml', '.txt')
    if os.path.exists(out_path):
        return
    pq = parse_file(case_xml_path)
    case_text = get_case_text(pq)
    tokens = nltk.word_tokenize(case_text)
    makedirs(os.path.dirname(out_path))
    with open(out_path, 'w') as out:
        out.write(u"\n".join(tokens).encode("utf8"))
Example 7
def search_front_matter():
    for jurisdiction_path in sorted(glob(os.path.join(source_dir, "*"))):
        makedirs(jurisdiction_path.replace(source_dir, dest_dir))
        for series_path in glob(os.path.join(jurisdiction_path, "*")):
            print series_path
            try:
                out = u""
                for volume_path in sorted(glob(os.path.join(series_path, "*")), key=lambda x: int(x.rsplit('/',1)[1])):

                    # load first case in volume
                    case_paths = sorted(glob(os.path.join(volume_path, "*.xml")))
                    if not case_paths:
                        continue
                    first_case_path = case_paths[0]
                    pq = parse_file(first_case_path)

                    # stop processing volume after 1923
                    year = get_decision_date(pq).year
                    if year > 1923:
                        break

                    # get first alto file for first case
                    first_case_alto_file = pq('METS|fileGrp[USE="alto"] METS|FLocat')[0].attrib[qn("xlink|href")][3:]
                    first_case_alto_name = os.path.basename(first_case_alto_file)

                    # get directory for alto files for volume
                    case_id = pq("case|case").attr('caseid')
                    alto_dir = os.path.dirname(os.path.join(raw_source_dir, case_id, first_case_alto_file)).replace('_0001', '_redacted')

                    # process alto files until we hit the one for the first case in the volume
                    for alto_path in sorted(glob(os.path.join(alto_dir, "*"))):
                        if alto_path.endswith(first_case_alto_name):
                            break

                        # only bother parsing XML if we find 'reporter' in the text of the alto file somewhere
                        alto_data = open(alto_path).read()
                        if 'reporter' not in alto_data.lower():
                            continue
                        alto_pq = PyQuery(alto_data, parser='xml', namespaces=namespaces)

                        # extract OCR'd text from alto XML
                        alto_text = " ".join(x.attrib["CONTENT"] for x in alto_pq('alto|String'))

                        # if page has more than fifty lowercase words, less than 15 uppercase words (usually a list of judges),
                        # and less than 30 periods (usually a table of contents), print citation and page text
                        if len(re.findall(r'\b[a-z]+\b', alto_text))>50 and len(re.findall(r'\b[A-Z][A-Z]+\b', alto_text))<15 and len(re.findall(r'\.', alto_text))<30:
                            volume_cite = pq('case|citation[category="official"]').text().rsplit(" ",1)[0]
                            out += "%s\n%s\n%s\n\n" % (alto_path, volume_cite, alto_text)

                # write out all matched pages for series
                if out:
                    open((series_path.replace(source_dir, dest_dir) + ".txt").replace('..', '.'), "w").write(out.encode('utf8'))

            except Exception as e:
                print "Skipping -- %s" % e
Example 8
def sort_case(case_xml_path):
    pq = parse_file(case_xml_path)
    jurisdiction = get_jurisdiction(pq)
    citation = get_citation(pq)
    cite_parts = citation.split(" ")
    volume, reporter, page_number = cite_parts[0], " ".join(cite_parts[1:-1]), cite_parts[-1]
    volume_dir = os.path.join(jurisdiction, reporter, volume)
    makedirs(volume_dir)
    dest_path = os.path.join(volume_dir, os.path.basename(case_xml_path))
    if os.path.exists(dest_path):
        os.remove(dest_path)
    os.link(case_xml_path, dest_path)
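
Note that makedirs() here (and throughout these examples) is a bare name rather than os.makedirs, so it is presumably a small wrapper that tolerates directories that already exist. A minimal sketch of such a helper:

import errno
import os

def makedirs(path):
    """Create path (and any parents), ignoring it if it already exists."""
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise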
Example 9
def sort_case(case_xml_path):
    pq = parse_file(case_xml_path)
    jurisdiction = get_jurisdiction(pq)
    citation = get_citation(pq)
    cite_parts = citation.split(" ")
    volume, reporter, page_number = cite_parts[0], " ".join(
        cite_parts[1:-1]), cite_parts[-1]
    volume_dir = os.path.join(jurisdiction, reporter, volume)
    makedirs(volume_dir)
    dest_path = os.path.join(volume_dir, os.path.basename(case_xml_path))
    if os.path.exists(dest_path):
        os.remove(dest_path)
    os.link(case_xml_path, dest_path)
Example 10
def limerick_jurisdiction(jurisdiction_path):
    sentence_count = 0
    lines = defaultdict(  # line_type
        lambda: defaultdict(  # emphasis_pattern
            lambda: defaultdict(  # last_syllable
                lambda: defaultdict(  # last_token
                    list  # sentence
            ))))

    sentence_lookup = set()

    def write_lines():
        out_path = jurisdiction_path.replace(source_dir, dest_dir, 1).rstrip('/')+'.json'
        makedirs(os.path.dirname(out_path))
        with open(out_path, 'w') as out:
            json.dump(lines, out)

    case_xml_paths = glob(os.path.join(jurisdiction_path, '*/*/*.xml'))
    for file_count, case_xml_path in tqdm(enumerate(case_xml_paths)):
        pq = parse_file(case_xml_path)
        sentences = nltk.sent_tokenize(get_case_text(pq))
        for sentence in sentences:
            sentence_count += 1

            # make sure we only check each sentence once
            sentence_hash = CityHash64(sentence)
            if sentence_hash in sentence_lookup:
                continue
            sentence_lookup.add(sentence_hash)

            tokens = list(tokenize_text(sentence))
            syllables = []
            for token in tokens:
                token = token.lower()
                if token not in pronunciations:
                    break
                syllables += pronunciations[token][0]

            else:
                # no unknown words found -- we can continue
                emphasis = u''.join(s for s in u"".join(syllables) if s.isdigit())
                line_type = None
                m = long_line_re.match(emphasis)
                if m:
                    line_type = 'long'
                else:
                    m = short_line_re.match(emphasis)
                    if m:
                        line_type = 'short'

                if line_type:
                    emphasis_pattern = u"1".join("*" * len(g) for g in m.groups())
                    if line_type == 'short':
                        emphasis_pattern += u'1'
                    last_token = tokens[-1].lower()
                    last_syllable = None
                    for i in reversed(range(len(syllables))):
                        if syllables[i][-1].isdigit():
                            last_syllable = u"".join(syllables[i:])
                            break

                    if last_syllable:
                        lines[line_type][emphasis_pattern][last_syllable][last_token].append(sentence.encode('utf8'))

        if not (file_count % 1000):
            print "Writing results so far."
            write_lines()

    write_lines()
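
The pronunciations mapping used above behaves like the CMU Pronouncing Dictionary as exposed by NLTK: a dict from lowercase word to a list of pronunciations, each a list of ARPABET phones whose vowels end in a stress digit (which is what the isdigit() checks rely on). Assuming that is the source, it could be built like this:

from nltk.corpus import cmudict

# Assumption: the module-level pronunciations dict comes from the CMU dict.
# nltk.download('cmudict')  # one-time download
pronunciations = cmudict.dict()

# pronunciations["court"][0] -> ['K', 'AO1', 'R', 'T']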
Example 11
import helpers

if __name__ == '__main__':
    pw_n_pol = helpers.parse_file('./resources/passwords.txt',
                                  helpers.PositionPolicy)
    num_valid = sum(1 for (pw, policy) in pw_n_pol if policy.validate(pw))
    print(len(pw_n_pol))
    print(num_valid)
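
This example uses a different parse_file from the case-XML helpers above: here a helpers module turns a passwords file into (password, policy) pairs, and PositionPolicy exposes a validate() method. Neither is shown, so the following is only a hypothetical sketch, assuming input lines like "1-3 a: abcde" and a rule that exactly one of the two 1-indexed positions must hold the given letter:

import re

class PositionPolicy:
    """Hypothetical policy: exactly one of the two 1-indexed positions holds the letter."""
    def __init__(self, first, second, letter):
        self.first, self.second, self.letter = first, second, letter

    def validate(self, pw):
        return (pw[self.first - 1] == self.letter) != (pw[self.second - 1] == self.letter)

def parse_file(path, policy_class):
    """Parse 'a-b x: password' lines into (password, policy) pairs."""
    line_re = re.compile(r"(\d+)-(\d+) (\w): (\w+)")
    pairs = []
    with open(path) as f:
        for match in (line_re.match(line) for line in f):
            if match:
                a, b, letter, pw = match.groups()
                pairs.append((pw, policy_class(int(a), int(b), letter)))
    return pairs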
Example 12
def get_data(filename):
    pq = parse_file(filename)
    jurisdiction = pq("case|court").attr('jurisdiction').strip()
    return filename, get_decision_date(pq), jurisdiction, get_case_text(pq)
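
Most of the examples above call parse_file() on an XML path and then query the result with PyQuery-style namespace selectors (case|court, METS|fileGrp, and so on). A minimal sketch of a helper along those lines, assuming it simply wraps PyQuery with the project's namespace map (only partially reproduced here):

from pyquery import PyQuery

# Assumed, partial namespace map; the real project defines the full set of
# prefixes, including the case| prefix used in the selectors above.
namespaces = {
    "METS": "http://www.loc.gov/METS/",
    "xlink": "http://www.w3.org/1999/xlink",
}

def parse_file(path):
    """Parse an XML file and return a PyQuery object with namespace support."""
    with open(path) as f:
        return PyQuery(f.read(), parser="xml", namespaces=namespaces)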