def poll():
    pop_messages = pop_feed(
        settings.POLL_USERNAME,
        settings.POLL_PASSWORD,
        settings.POLL_HOST
    )
    # pop_messages = disk_feed()
    messages = grab_messages(pop_messages)
    for message in messages:
        try:
            print "Processing message"
            for (filename, original_filename) in message["files"]:
                parse_file(filename)
        except Exception as e:
            print e
def ngram_volume(volume_path):
    counts = defaultdict(            # year
        lambda: defaultdict(         # ngram_len
            lambda: defaultdict(     # ngram
                int                  # count
            )))
    for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
        pq = parse_file(case_xml_path)
        tokens = tokenize_text(get_case_text(pq))
        history = []
        case_year = get_decision_date(pq).year
        for i, item in enumerate(tokens):
            history.append(item)
            for ngram_len in [1, 2, 3]:
                if len(history) >= ngram_len:
                    counts[case_year][ngram_len]["\t".join(history[-ngram_len:])] += 1
            # keep a sliding window of at most the last three tokens
            if i >= 2:
                del history[0]

    # write one JSON file per year and ngram length
    for year, ngram_lens in counts.iteritems():
        out_dir = os.path.join(volume_path.replace(source_dir, dest_dir, 1), str(year))
        makedirs(out_dir)
        for ngram_len, data in ngram_lens.items():
            with open(os.path.join(out_dir, "%s.json" % ngram_len), 'w') as out:
                json.dump(data, out)
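# Note: ngram_volume and several functions below call makedirs() on directories that
# may already exist, so it cannot be the stock os.makedirs. A minimal sketch of the
# helper these scripts presumably rely on (assumed -- the real implementation is not
# shown in this excerpt):
import errno
import os

def makedirs(path):
    # create the directory tree, ignoring "already exists" errors
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise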
def search_volumes():
    makedirs(dest_dir)
    for series_path in tqdm(sorted(glob(os.path.join(source_dir, "*/*")))):
        series_name = os.path.basename(series_path)
        known_series = defaultdict(lambda: {'count': 0, 'examples': []})
        for volume_path in sorted(glob(os.path.join(series_path, "*"))):
            for case_xml_path in glob(os.path.join(volume_path, '*.xml')):
                pq = parse_file(case_xml_path)
                text = get_case_text(pq)
                cites = cite_match.findall(text)
                for series in cites:
                    ks = known_series[series[1]]
                    ks['count'] += 1
                    if len(ks['examples']) < 3:
                        ks['examples'].append(" ".join(series))

        # write to CSV
        out = [[k, v['count']] + v['examples'] for k, v in known_series.iteritems()]
        out.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(dest_dir, '%s.csv' % series_name), 'wb') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['Series', 'Count', 'Example 1', 'Example 2', 'Example 3'])
            for row in out:
                csvwriter.writerow(row)
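# cite_match is used above but not defined in this excerpt. From how its matches are
# consumed (series[1] as the reporter name, " ".join(series) as the full citation),
# it is presumably a three-group volume/reporter/page regex along these lines
# (an illustrative guess, not the original pattern):
import re

cite_match = re.compile(r"(\d+) ([A-Z][\w.'& ]*?) (\d+)")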
def tokenize_case(case_xml_path):
    out_path = case_xml_path.replace(source_dir, dest_dir, 1).replace('.xml', '.txt')
    if os.path.exists(out_path):
        return
    pq = parse_file(case_xml_path)
    case_text = get_case_text(pq)
    tokens = nltk.word_tokenize(case_text)
    makedirs(os.path.dirname(out_path))
    with open(out_path, 'w') as out:
        out.write(u"\n".join(tokens).encode("utf8"))
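# tokenize_case writes one token per line as UTF-8. A hypothetical reader for those
# files, included only to document the on-disk format (read_tokens is not part of
# the original code):
def read_tokens(txt_path):
    # load the newline-delimited tokens written by tokenize_case
    with open(txt_path) as f:
        return f.read().decode("utf8").split(u"\n")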
def search_front_matter():
    for jurisdiction_path in sorted(glob(os.path.join(source_dir, "*"))):
        makedirs(jurisdiction_path.replace(source_dir, dest_dir))
        for series_path in glob(os.path.join(jurisdiction_path, "*")):
            print series_path
            try:
                out = u""
                for volume_path in sorted(glob(os.path.join(series_path, "*")),
                                          key=lambda x: int(x.rsplit('/', 1)[1])):

                    # load first case in volume
                    case_paths = sorted(glob(os.path.join(volume_path, "*.xml")))
                    if not case_paths:
                        continue
                    first_case_path = case_paths[0]
                    pq = parse_file(first_case_path)

                    # stop processing volume after 1923
                    year = get_decision_date(pq).year
                    if year > 1923:
                        break

                    # get first alto file for first case
                    first_case_alto_file = pq('METS|fileGrp[USE="alto"] METS|FLocat')[0].attrib[qn("xlink|href")][3:]
                    first_case_alto_name = os.path.basename(first_case_alto_file)

                    # get directory for alto files for volume
                    case_id = pq("case|case").attr('caseid')
                    alto_dir = os.path.dirname(os.path.join(raw_source_dir, case_id, first_case_alto_file)).replace('_0001', '_redacted')

                    # process alto files until we hit the one for the first case in the volume
                    for alto_path in sorted(glob(os.path.join(alto_dir, "*"))):
                        if alto_path.endswith(first_case_alto_name):
                            break

                        # only bother parsing XML if we find 'reporter' in the text of the alto file somewhere
                        alto_data = open(alto_path).read()
                        if 'reporter' not in alto_data.lower():
                            continue
                        alto_pq = PyQuery(alto_data, parser='xml', namespaces=namespaces)

                        # extract OCR'd text from alto XML
                        alto_text = " ".join(x.attrib["CONTENT"] for x in alto_pq('alto|String'))

                        # if the page has more than 50 lowercase words, fewer than 15 uppercase words (usually a list of judges),
                        # and fewer than 30 periods (usually a table of contents), record the citation and page text
                        if len(re.findall(r'\b[a-z]+\b', alto_text)) > 50 and \
                                len(re.findall(r'\b[A-Z][A-Z]+\b', alto_text)) < 15 and \
                                len(re.findall(r'\.', alto_text)) < 30:
                            volume_cite = pq('case|citation[category="official"]').text().rsplit(" ", 1)[0]
                            out += "%s\n%s\n%s\n\n" % (alto_path, volume_cite, alto_text)

                # write out all matched pages for series
                if out:
                    open((series_path.replace(source_dir, dest_dir) + ".txt").replace('..', '.'), "w").write(out.encode('utf8'))
            except Exception as e:
                print "Skipping -- %s" % e
def sort_case(case_xml_path):
    pq = parse_file(case_xml_path)
    jurisdiction = get_jurisdiction(pq)
    citation = get_citation(pq)
    cite_parts = citation.split(" ")
    volume, reporter, page_number = cite_parts[0], " ".join(cite_parts[1:-1]), cite_parts[-1]
    volume_dir = os.path.join(jurisdiction, reporter, volume)
    makedirs(volume_dir)
    # hard-link the case into its sorted location, replacing any existing copy
    dest_path = os.path.join(volume_dir, os.path.basename(case_xml_path))
    if os.path.exists(dest_path):
        os.remove(dest_path)
    os.link(case_xml_path, dest_path)
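# sort_case hard-links each case file into jurisdiction/reporter/volume directories.
# A hypothetical driver over a flat dump of case XML files (the glob pattern used
# upstream is not shown, so this is only illustrative):
from glob import glob

def sort_all_cases(case_glob="cases/*/*.xml"):
    for case_xml_path in sorted(glob(case_glob)):
        sort_case(case_xml_path)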
def limerick_jurisdiction(jurisdiction_path):
    sentence_count = 0
    lines = defaultdict(                 # line_type
        lambda: defaultdict(             # emphasis_pattern
            lambda: defaultdict(         # last_syllable
                lambda: defaultdict(     # last_token
                    list                 # sentence
                ))))
    sentence_lookup = set()

    def write_lines():
        out_path = jurisdiction_path.replace(source_dir, dest_dir, 1).rstrip('/') + '.json'
        makedirs(os.path.dirname(out_path))
        with open(out_path, 'w') as out:
            json.dump(lines, out)

    case_xml_paths = glob(os.path.join(jurisdiction_path, '*/*/*.xml'))
    for file_count, case_xml_path in tqdm(enumerate(case_xml_paths)):
        pq = parse_file(case_xml_path)
        sentences = nltk.sent_tokenize(get_case_text(pq))
        for sentence in sentences:
            sentence_count += 1

            # make sure we only check each sentence once
            sentence_hash = CityHash64(sentence)
            if sentence_hash in sentence_lookup:
                continue
            sentence_lookup.add(sentence_hash)

            tokens = list(tokenize_text(sentence))
            syllables = []
            for token in tokens:
                token = token.lower()
                if token not in pronunciations:
                    break
                syllables += pronunciations[token][0]
            else:
                # no unknown words found -- we can continue
                emphasis = u''.join(s for s in u"".join(syllables) if s.isdigit())
                line_type = None
                m = long_line_re.match(emphasis)
                if m:
                    line_type = 'long'
                else:
                    m = short_line_re.match(emphasis)
                    if m:
                        line_type = 'short'
                if line_type:
                    emphasis_pattern = u"1".join("*" * len(g) for g in m.groups())
                    if line_type == 'short':
                        emphasis_pattern += u'1'
                    last_token = tokens[-1].lower()
                    last_syllable = None
                    for i in reversed(range(len(syllables))):
                        if syllables[i][-1].isdigit():
                            last_syllable = u"".join(syllables[i:])
                            break
                    if last_syllable:
                        lines[line_type][emphasis_pattern][last_syllable][last_token].append(sentence.encode('utf8'))

        if not (file_count % 1000):
            print "Writing results so far."
            write_lines()

    write_lines()
import helpers

if __name__ == '__main__':
    pw_n_pol = helpers.parse_file('./resources/passwords.txt', helpers.PositionPolicy)
    num_valid = sum(1 for (pw, policy) in pw_n_pol if policy.validate(pw))
    print(len(pw_n_pol))
    print(num_valid)
def get_data(filename):
    pq = parse_file(filename)
    jurisdiction = pq("case|court").attr('jurisdiction').strip()
    return filename, get_decision_date(pq), jurisdiction, get_case_text(pq)
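# The parse_file helper used throughout is assumed to wrap PyQuery with the same XML
# namespace map used above ("case", "METS", "alto", "xlink"). A rough sketch under
# that assumption, not the original implementation:
from pyquery import PyQuery

def parse_file(path):
    # parse the case XML into a namespace-aware PyQuery object;
    # `namespaces` is the module-level prefix->URI map these scripts already use
    with open(path) as f:
        return PyQuery(f.read(), parser='xml', namespaces=namespaces)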