def gather_eccodata(doc_id, target_path, ecco_source_dict, force_fetch=False):
    txt_target_path = target_path + "/" + str(doc_id) + "/fulltext"
    xml_target_path = target_path + "/" + str(doc_id) + "/xml"
    pagetxt_target_path = target_path + "/" + str(doc_id) + "/pagetexts"
    img_target_path = target_path + "/" + str(doc_id) + "/img"
    # create dirs if missing
    for path in [txt_target_path, xml_target_path,
                 pagetxt_target_path, img_target_path]:
        create_dir_if_not_exists(path)
    # get fulltext file
    if path_is_empty(txt_target_path) or force_fetch:
        source_path = ecco_source_dict[doc_id]['path']
        sourcefiles = glob.glob(source_path + "/*.txt")
        for sourcefile in sourcefiles:
            copy2(sourcefile, txt_target_path + "/")
    # get pages from Pouta with scp
    if path_is_empty(pagetxt_target_path) or force_fetch:
        source_path = ecco_source_dict[doc_id]['pouta_pages']
        os.system("scp -i ../../comhis.pem " + source_path + "/* " +
                  pagetxt_target_path + "/.")
    # get xml from Pouta with scp
    if path_is_empty(xml_target_path) or force_fetch:
        source_path = ecco_source_dict[doc_id]['pouta_xml']
        os.system("scp -i ../../comhis.pem " + source_path + "/* " +
                  xml_target_path + "/.")
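# A sketch of the per-document source dict that gather_eccodata() reads.
# Only the keys ('path', 'pouta_pages', 'pouta_xml') come from the function
# above; the document id is one of those listed in docids_first further
# down, and the local path and Pouta scp locations are hypothetical
# placeholders.
example_ecco_source_dict = {
    "1729400106": {
        'path': "../data/raw/ecco-txt/1729400106",   # local *.txt fulltext
        'pouta_pages': "user@pouta.example:/ecco/1729400106/pages",
        'pouta_xml': "user@pouta.example:/ecco/1729400106/xml",
    },
}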
def write_cluster_list_results_csv(cluster_list, outpath_prefix,
                                   include_date=False):
    print("> Writing cluster list as csv ...")
    group_ids = set()
    for cluster in cluster_list:
        group_ids.add(cluster.group_id)
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/by_header/"
    create_dir_if_not_exists(outdir)
    for group_id in group_ids:
        outfile = outdir + str(group_id) + ".csv"
        # write the header row, then let each cluster append its own row
        with open(outfile, 'w') as output_file:
            csvwriter = csv.writer(output_file)
            csvwriter.writerow([
                'cluster_id', 'ecco_id', 'estc_id', 'author',
                'political_view', 'title', 'preceding_header', 'year',
                'guessed_first_ed_year', 'location', 'text_before', 'text',
                'text_after', 'preceding_header_index', 'start_index',
                'end_index', 'document_length', 'document_collection',
                'group_name', 'group_id', 'group_start_index',
                'group_end_index'
            ])
        for cluster in cluster_list:
            if cluster.group_id == group_id:
                cluster.write_cluster_csv(outfile, include_header_row=False,
                                          method='a')
    print(" >> Done!")
def write_plotdata_countries_csv(plotdata_countries, outpath_prefix,
                                 include_date=True):
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    fieldnames = [
        'index', 'header', 'USA', 'England', 'Scotland', 'Ireland', 'Others'
    ]
    output_csvfile = outdir + "plotdata_countries.csv"
    with open(output_csvfile, 'w') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        for row in plotdata_countries:
            csvwriter.writerow(row)
    # summary file: column totals across all rows
    output_csvsummary = outdir + "plotdata_countries_sum.csv"
    with open(output_csvsummary, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fieldnames[2:])
        usa = england = scotland = ireland = others = 0
        for row in plotdata_countries:
            usa += row.get('USA')
            england += row.get('England')
            scotland += row.get('Scotland')
            ireland += row.get('Ireland')
            others += row.get('Others')
        csvwriter.writerow([usa, england, scotland, ireland, others])
def write_header_summarydata_csv(header_summarydata, outpath_prefix,
                                 outfile_suffix="", include_date=False):
    print("> Writing header summary data as csv ...")
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    output_csvfile = outdir + "header_summary" + outfile_suffix + ".csv"
    with open(output_csvfile, 'w') as output_file:
        csvwriter = csv.writer(output_file)
        csvwriter.writerow([
            'header_index', 'header_text', 'total_fragments',
            'unique_authors', 'authors', 'unique_titles', 'titles'
        ])
        for row in header_summarydata:
            csvwriter.writerow([
                row.get('header_index'), row.get('header_text'),
                row.get('total_fragments'), row.get('unique_authors'),
                row.get('authors'), row.get('unique_titles'),
                row.get('titles')
            ])
def write_header_plotdata_csv(header_plotdata, outpath_prefix,
                              include_date=True):
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    outputfile = outdir + "header_plotdata.csv"
    with open(outputfile, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        indices = header_plotdata[0].get('indices')
        header_texts = header_plotdata[0].get('headers')
        # one result column per 'within' limiter
        limiters = []
        for item in header_plotdata:
            limiters.append("within_" + str(item.get('within')))
        headerrow = ['header_index', 'header_text']
        headerrow.extend(limiters)
        csvwriter.writerow(headerrow)
        for i in range(len(indices)):
            header_index_hits = []
            for item in header_plotdata:
                header_index_hits.append(item.get('hits')[i])
            resultrow = [indices[i], header_texts[i]]
            resultrow.extend(header_index_hits)
            csvwriter.writerow(resultrow)
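# write_header_plotdata_csv() assumes header_plotdata is a list of dicts
# with parallel 'indices', 'headers' and 'hits' lists plus a 'within'
# limiter, all items sharing the same indices and headers. A hypothetical
# two-limiter example (header texts and counts invented for illustration):
example_header_plotdata = [
    {'within': 50, 'indices': [0, 1],
     'headers': ['CHAP. I.', 'CHAP. II.'], 'hits': [3, 7]},
    {'within': 100, 'indices': [0, 1],
     'headers': ['CHAP. I.', 'CHAP. II.'], 'hits': [5, 12]},
]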
def write_csv_total_fragments_per_author(plotdata, author_metadata,
                                         outpath_prefix, include_date=True):
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    outputfile = outdir + "fragments_per_author.csv"
    with open(outputfile, 'w') as output_csv:
        headerrow = ["author", "total_fragments", "political_views", "link"]
        headerrow.extend(plotdata[0].get('header_texts'))
        csvwriter = csv.writer(output_csv)
        csvwriter.writerow(headerrow)
        for entry in plotdata:
            author = entry.get('author')
            total = entry.get('total')
            views = "no_record"
            link = "no_record"
            if author in author_metadata:
                a_meta = author_metadata[author]
                views = a_meta.get("political_views")
                link = a_meta.get("odnb_link")
            outrow = [author, total, views, link]
            outrow.extend(entry.get('header_hits'))
            csvwriter.writerow(outrow)
def write_plotdata_politics_csv(plotdata_politics, outpath_prefix,
                                include_date=True):
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    fieldnames = [
        'index', 'header', 'whig', 'royalist', 'jacobite', 'parliamentarian',
        'tory', 'unionist', 'no_record', 'whig_wide', 'tory_wide',
        'others_wide'
    ]
    output_csvfile = outdir + "plotdata_political_views.csv"
    with open(output_csvfile, 'w') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        for row in plotdata_politics:
            csvwriter.writerow(row)
    # summary file: column totals across all rows
    output_csvsummary = outdir + "plotdata_politics_sum.csv"
    with open(output_csvsummary, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fieldnames[2:])
        totals = {name: 0 for name in fieldnames[2:]}
        for row in plotdata_politics:
            for name in totals:
                totals[name] += row.get(name)
        csvwriter.writerow([totals[name] for name in fieldnames[2:]])
def write_document_text_with_coverage_highlight(document_text, outpath_prefix,
                                                include_date=False):
    print("> Writing document text with coverage highlights ...")
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    output_txtfile = outdir + "coverage_highlight.txt"
    with open(output_txtfile, 'w') as output_file:
        output_file.write(document_text)
def write_cluster_coverage_as_csv(coverage_data, outpath_prefix,
                                  include_date=False):
    print("> Writing cluster coverage as csv ...")
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    output_csvfile = outdir + "cluster_coverage.csv"
    with open(output_csvfile, 'w') as coverage_file:
        csvwriter = csv.writer(coverage_file)
        csvwriter.writerow(['Coverage'])
        for row in coverage_data:
            csvwriter.writerow([row])
def save_plotdata_csv(plotdata, xlabel, ylabel, outpath_prefix,
                      include_date=False):
    xdata = plotdata.get('x')
    ydata = plotdata.get('y')
    data_size = len(xdata)
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    output_csvfile = outdir + "plotdata_" + xlabel + "-" + ylabel + ".csv"
    with open(output_csvfile, 'w') as output_file:
        csvwriter = csv.writer(output_file)
        csvwriter.writerow([xlabel, ylabel])
        for i in range(data_size):
            csvwriter.writerow([xdata[i], ydata[i]])
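# save_plotdata_csv() expects parallel 'x' and 'y' lists in plotdata. A
# hypothetical call (labels and values invented) would write
# output/<prefix>/plotdata_year-fragments.csv:
#
# save_plotdata_csv({'x': [1760, 1770, 1780], 'y': [12, 30, 9]},
#                   'year', 'fragments', 'hume_history')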
def write_csv_total_fragments_per_author_per_year(plotdata, outpath_prefix,
                                                  include_date=True):
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    topx = len(plotdata)
    # note: despite the function name, the data is bucketed per decade
    outputfile = (outdir + "fragments_per_author_per_decade_top" +
                  str(topx) + ".csv")
    with open(outputfile, 'w') as output_csv:
        headerrow = ["decade"]
        authors = []
        for entry in plotdata:
            authors.append(entry.get('author'))
        headerrow.extend(authors)
        csvwriter = csv.writer(output_csv)
        csvwriter.writerow(headerrow)
        decades = plotdata[0].get('decades')
        for i in range(len(decades)):
            datarow = [decades[i]]
            for entry in plotdata:
                datarow.append(entry.get('decade_fragments')[i])
            csvwriter.writerow(datarow)
def get_default_logfile(logdir="./logs"):
    create_dir_if_not_exists(logdir)
    return logdir + "/log.txt"
def get_start_params(argv):
    inputfile = ""
    try:
        opts, args = getopt.getopt(argv, "", ["inputfile="])
    except getopt.GetoptError:
        sys.exit(2)
    for opt, arg in opts:
        if opt == "--inputfile":
            inputfile = arg
    print("inputfile: " + inputfile)
    return inputfile


# read input parameters
inputfile = get_start_params(sys.argv[1:])
outputprefix = inputfile[:-4]
outputpath = '../../output/work/octavo_indices/' + outputprefix + '/'
create_dir_if_not_exists(outputpath)

# set up api clients
ecco_api_client = OctavoEccoClient()
cluster_api_client = OctavoEccoClusterClient(timeout=60)

fields_ecco = ["documentID", "content"]
field_eccocluster = [
    "documentID", "fragmentID", "text", "startIndex", "endIndex"
]

# read ids to process
ids_to_process = set()
add_csv_ids_to_set('cfg/indexdata_dump/' + inputfile, ids_to_process)

# prepare log file and list of processed ids
# docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv')
xml_img_page_datadir = "../data/raw/ecco-xml-img/"

fields_ecco = ["documentID", "content"]
field_eccocluster = [
    "documentID", "fragmentID", "text", "startIndex", "endIndex"
]

datadir = get_datadir() + "/"

# reuse data: list of JSON files
reuse_data_dir = "../output/" + datadir
add_cludata = read_reuse_data_dir(reuse_data_dir)

outdir = "../output/pdfs/" + datadir
create_dir_if_not_exists(outdir)
temp_workdir = "../data/work/img/" + datadir
create_dir_if_not_exists(temp_workdir)

# Hume, History of England
docids_first = [
    # "1729100401",
    # "1729200102",
    # "1729200103",
    # "1729300104",
    # "1729400105",
    "1729400106",
    "1729500107",
    # "1729500108",
]
def create_csv_summaries(outputpath, documents_meta_dict):
    create_dir_if_not_exists(outputpath)
    current_subdirs = glob(outputpath + "*/")

    # volume totals
    filenames = []
    for pathpart in current_subdirs:
        filenames.append(pathpart + "header_plotdata.csv")
    outfilename = outputpath + "volume_totals.csv"
    with open(outfilename, 'w') as csvoutfile:
        csvwriter = csv.writer(csvoutfile)
        csvwriter.writerow(
            ['volume', 'fragments', 'length', 'description', 'sequence'])
    for filename in filenames:
        book_index = filename.split('/')[2]
        add_filename_to_totals(outfilename, filename, book_index,
                               documents_meta_dict)

    # author totals
    author_metadata = read_author_metadata_csv(
        "../data-public/authors-metadata/misc/author_metadata.csv")
    author_filenames = []
    for pathpart in current_subdirs:
        author_filenames.append(pathpart + "fragments_per_author.csv")
    outfilename = outputpath + "author_totals.csv"
    author_totals = {}
    for filename in author_filenames:
        add_to_author_totals(author_totals, filename)
    with open(outfilename, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['author', 'fragments', 'political_views', 'link'])
        for key, value in author_totals.items():
            if key in author_metadata:
                author_politics = author_metadata.get(key).get(
                    'political_views')
                author_link = author_metadata.get(key).get('link')
            else:
                author_politics = "no_record"
                author_link = "no_record"
            csvwriter.writerow([key, value, author_politics, author_link])

    # per-header author totals: write the header row first, then append
    # one batch of rows per input file
    header_filenames = glob(outputpath + "*/*/*.csv")
    header_author_totals_outfile = (outputpath +
                                    "reuses_by_author_all_headers.csv")
    with open(header_author_totals_outfile, 'w') as csvout:
        csvwriter = csv.writer(csvout)
        csvwriter.writerow(
            ['author', 'eccoid', 'header', 'header_index', 'fragments'])
    for filename in header_filenames:
        bookid = filename.split('/')[2]
        with open(filename, 'r') as csvfile:
            csvreader = csv.DictReader(csvfile)
            author_totals = {}
            for row in csvreader:
                author = row.get('author')
                group_name = row.get('group_name')
                group_id = row.get('group_id')
                if author not in author_totals:
                    author_totals[author] = 1
                else:
                    author_totals[author] += 1
        with open(header_author_totals_outfile, 'a') as csvout:
            csvwriter = csv.writer(csvout)
            for key, value in author_totals.items():
                csvwriter.writerow([key, bookid, group_name, group_id, value])

    # sum political views
    pol_filenames = []
    for pathpart in current_subdirs:
        pol_filenames.append(pathpart + "plotdata_politics_sum.csv")
    outfilename = outputpath + "politics_summary.csv"
    with open(outfilename, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([
            'volume', 'content', 'sequence', 'whig', 'royalist', 'jacobite',
            'parliamentarian', 'tory', 'unionist', 'no_record', 'whig_wide',
            'tory_wide', 'others_wide'
        ])
        for filename in pol_filenames:
            bookid = filename.split('/')[2]
            book_meta = documents_meta_dict[bookid]
            with open(filename, 'r') as readfile:
                csvreader = csv.DictReader(readfile)
                for row in csvreader:
                    csvwriter.writerow([
                        bookid,
                        book_meta.get('description'),
                        book_meta.get('sequence'),
                        row.get('whig'),
                        row.get('royalist'),
                        row.get('jacobite'),
                        row.get('parliamentarian'),
                        row.get('tory'),
                        row.get('unionist'),
                        row.get('no_record'),
                        row.get('whig_wide'),
                        row.get('tory_wide'),
                        row.get('others_wide')
                    ])
def write_document_html_with_coverage_highlight(coverage_data, document_text,
                                                outpath_prefix,
                                                include_date=False):
    print("> Writing document html with coverage highlights ...")
    document_text_list = list(document_text)
    document_length = len(document_text_list)
    html_text_list = []
    html_text_list.append(get_html_start(add_header=True))
    html_text_list.append("<p>")
    char_index = 0
    prev_char_in_cluster = False
    mark_tag_open = False
    header_tag_open = False
    mark_tag_reopen = False
    while char_index < document_length:
        this_char_in_cluster = coverage_data[char_index] != 0
        # open/close <mark> at cluster boundaries
        if this_char_in_cluster and not prev_char_in_cluster:
            html_text_list.append("<mark>")
            mark_tag_open = True
        elif not this_char_in_cluster and prev_char_in_cluster:
            html_text_list.append("</mark>")
            mark_tag_open = False
        # "\n\n#" starts a header paragraph; the bounds checks guard
        # against running past the end of the text
        if (char_index + 2 < document_length
                and document_text_list[char_index] == "\n"
                and document_text_list[char_index + 1] == "\n"
                and document_text_list[char_index + 2] == "#"):
            header_tag_open = True
            if mark_tag_open:
                html_text_list.append("</mark></p>\n<p><h1>")
                mark_tag_reopen = True
            else:
                html_text_list.append("</p>\n<p><h1>")
            char_to_append = ""
            char_index_step = 3
        elif (char_index + 1 < document_length
              and document_text_list[char_index] == "\n"
              and document_text_list[char_index + 1] == "\n"):
            # "\n\n" ends the current paragraph
            if mark_tag_open:
                html_text_list.append("</mark>")
                mark_tag_reopen = True
            if header_tag_open:
                html_text_list.append("</h1>")
                header_tag_open = False
            html_text_list.append("</p>")
            char_to_append = "\n<p>"
            char_index_step = 2
        elif document_text_list[char_index] == "\n":
            char_to_append = " "
            char_index_step = 1
        else:
            # html.escape(..., quote=False) from the stdlib 'html' module
            # replaces the removed cgi.escape
            char_to_append = html.escape(document_text_list[char_index],
                                         quote=False)
            char_index_step = 1
        html_text_list.append(char_to_append)
        if mark_tag_reopen:
            html_text_list.append("<mark>")
            mark_tag_reopen = False
        prev_char_in_cluster = this_char_in_cluster
        char_index += char_index_step
    if mark_tag_open:
        html_text_list.append("</mark>")
    html_text_list.append("</p>")
    html_text_list.append(get_html_end())
    html_text_results = ''.join(html_text_list)
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    output_htmlfile = outdir + "coverage_highlight.html"
    with open(output_htmlfile, 'w') as output_file:
        output_file.write(html_text_results)
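# coverage_data is assumed to be a per-character sequence aligned with
# document_text: 0 where no reuse cluster covers the character, non-zero
# where one does. A hypothetical minimal call (text and coverage invented):
#
# text = "Reused opening.\n\nUncovered tail."
# coverage = [1] * 15 + [0] * (len(text) - 15)
# write_document_html_with_coverage_highlight(coverage, text, "demo")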