Example #1
0
def gather_eccodata(doc_id, target_path, ecco_source_dict, force_fetch=False):
    """Collect fulltext, page texts, XML and image data for one ECCO document.

    Creates the per-document subdirectories under target_path and fills
    them from the locations recorded in ecco_source_dict. Targets that
    are already non-empty are left alone unless force_fetch is True.
    """
    doc_base = target_path + "/" + str(doc_id)
    txt_target_path = doc_base + "/fulltext"
    xml_target_path = doc_base + "/xml"
    pagetxt_target_path = doc_base + "/pagetexts"
    img_target_path = doc_base + "/img"
    # make sure every target directory exists before fetching anything
    for target in (txt_target_path, xml_target_path, pagetxt_target_path,
                   img_target_path):
        create_dir_if_not_exists(target)
    # local fulltext .txt files are simply copied over
    if force_fetch or path_is_empty(txt_target_path):
        for txtfile in glob.glob(ecco_source_dict[doc_id]['path'] + "/*.txt"):
            copy2(txtfile, (txt_target_path + "/"))
    # page texts live on pouta; fetch them with scp
    if force_fetch or path_is_empty(pagetxt_target_path):
        os.system("scp -i ../../comhis.pem " +
                  ecco_source_dict[doc_id]['pouta_pages'] + "/* " +
                  pagetxt_target_path + "/.")
    # the XML likewise comes from pouta via scp
    if force_fetch or path_is_empty(xml_target_path):
        os.system("scp -i ../../comhis.pem " +
                  ecco_source_dict[doc_id]['pouta_xml'] + "/* " +
                  xml_target_path + "/.")
Example #2
0
def write_cluster_list_results_csv(cluster_list,
                                   outpath_prefix,
                                   include_date=False):
    """Write the clusters into one csv file per group under by_header/.

    Each group's file first gets a header row; the clusters that belong
    to that group are then appended through Cluster.write_cluster_csv.
    """
    print("> Writing cluster list as csv ...")

    # bucket the clusters by group id up front instead of rescanning the
    # whole list once per group (was O(groups * clusters))
    clusters_by_group = {}
    for cluster in cluster_list:
        clusters_by_group.setdefault(cluster.group_id, []).append(cluster)

    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)

    outdir = "output/" + outpath_prefix + "/by_header/"
    create_dir_if_not_exists(outdir)

    for group_id, group_clusters in clusters_by_group.items():
        outfile = outdir + str(group_id) + ".csv"
        # newline='' as the csv docs require (avoids blank rows on Windows)
        with open(outfile, 'w', newline='') as output_file:
            csvwriter = csv.writer(output_file)
            csvwriter.writerow([
                'cluster_id', 'ecco_id', 'estc_id', 'author', 'political_view',
                'title', 'preceding_header', 'year', 'guessed_first_ed_year',
                'location', 'text_before', 'text', 'text_after',
                'preceding_header_index', 'start_index', 'end_index',
                'document_length', 'document_collection', 'group_name',
                'group_id', 'group_start_index', 'group_end_index'
            ])
        for cluster in group_clusters:
            cluster.write_cluster_csv(outfile,
                                      include_header_row=False,
                                      method='a')
    print("  >> Done!")
Example #3
0
def write_plotdata_countries_csv(plotdata_countries,
                                 outpath_prefix,
                                 include_date=True):
    """Write per-country plot data plus a one-row totals summary csv.

    plotdata_countries is a sequence of dicts keyed by the fieldnames
    below; the summary file contains the column sums of the country
    columns only.
    """
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)

    fieldnames = [
        'index', 'header', 'USA', 'England', 'Scotland', 'Ireland', 'Others'
    ]

    output_csvfile = (outdir + "plotdata_countries.csv")
    # newline='' as the csv docs require (avoids blank rows on Windows)
    with open(output_csvfile, 'w', newline='') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        csvwriter.writerows(plotdata_countries)

    output_csvsummary = (outdir + "plotdata_countries_sum.csv")
    with open(output_csvsummary, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        country_fields = fieldnames[2:]
        csvwriter.writerow(country_fields)
        # column totals; replaces five hand-rolled accumulators
        csvwriter.writerow([
            sum(row.get(field) for row in plotdata_countries)
            for field in country_fields
        ])
Example #4
0
def write_header_summarydata_csv(header_summarydata,
                                 outpath_prefix,
                                 outfile_suffix="",
                                 include_date=False):
    """Dump header summary rows to output/<prefix>/header_summary<suffix>.csv.

    header_summarydata is a sequence of dicts keyed by the column names
    written in the header row.
    """
    print("> Writing header summary data as csv ...")

    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)

    # single source of truth for both the header row and the value order
    columns = [
        'header_index', 'header_text', 'total_fragments', 'unique_authors',
        'authors', 'unique_titles', 'titles'
    ]
    output_csvfile = (outdir + "header_summary" + outfile_suffix + ".csv")
    with open(output_csvfile, 'w') as output_file:
        csvwriter = csv.writer(output_file)
        csvwriter.writerow(columns)
        for row in header_summarydata:
            csvwriter.writerow([row.get(column) for column in columns])
Example #5
0
def write_header_plotdata_csv(header_plotdata,
                              outpath_prefix,
                              include_date=True):
    """Write header hit counts as one csv table, one column per limiter.

    Every entry of header_plotdata shares the same 'indices' and
    'headers' lists; each entry contributes a "within_<n>" hits column.
    """
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    outputfile = outdir + "header_plotdata.csv"
    with open(outputfile, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        indices = header_plotdata[0].get('indices')
        header_texts = header_plotdata[0].get('headers')
        # one hits column per 'within' limiter
        limiters = [
            "within_" + str(entry.get('within')) for entry in header_plotdata
        ]
        csvwriter.writerow(['header_index', 'header_text'] + limiters)
        for i, header_index in enumerate(indices):
            hits = [entry.get('hits')[i] for entry in header_plotdata]
            csvwriter.writerow([header_index, header_texts[i]] + hits)
Example #6
0
def write_csv_total_fragments_per_author(plotdata,
                                         author_metadata,
                                         outpath_prefix,
                                         include_date=True):
    """Write per-author fragment totals plus per-header hit counts as csv.

    Authors missing from author_metadata get "no_record" placeholders for
    their political views and ODNB link.
    """
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    outputfile = outdir + "fragments_per_author.csv"
    with open(outputfile, 'w') as output_csv:
        csvwriter = csv.writer(output_csv)
        csvwriter.writerow(
            ["author", "total_fragments", "political_views", "link"] +
            plotdata[0].get('header_texts'))
        for entry in plotdata:
            author = entry.get('author')
            if author in author_metadata:
                meta = author_metadata[author]
                views = meta.get("political_views")
                link = meta.get("odnb_link")
            else:
                views = "no_record"
                link = "no_record"
            resultrow = [author, entry.get('total'), views, link]
            resultrow.extend(entry.get('header_hits'))
            csvwriter.writerow(resultrow)
Example #7
0
def write_plotdata_politics_csv(plotdata_politics,
                                outpath_prefix,
                                include_date=True):
    """Write per-row political-view plot data plus a totals summary csv.

    plotdata_politics is a sequence of dicts keyed by the fieldnames
    below; the summary file contains the column sums of the view columns
    (everything after 'index' and 'header').
    """
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)

    fieldnames = [
        'index', 'header', 'whig', 'royalist', 'jacobite', 'parliamentarian',
        'tory', 'unionist', 'no_record', 'whig_wide', 'tory_wide',
        'others_wide'
    ]

    output_csvfile = (outdir + "plotdata_political_views.csv")
    # newline='' as the csv docs require (avoids blank rows on Windows)
    with open(output_csvfile, 'w', newline='') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        csvwriter.writerows(plotdata_politics)

    output_csvsummary = (outdir + "plotdata_politics_sum.csv")
    with open(output_csvsummary, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        view_fields = fieldnames[2:]
        csvwriter.writerow(view_fields)
        # column totals driven by the fieldname list; replaces ten
        # hand-rolled accumulator variables
        csvwriter.writerow([
            sum(row.get(field) for row in plotdata_politics)
            for field in view_fields
        ])
Example #8
0
def write_document_text_with_coverage_highlight(document_text,
                                                outpath_prefix,
                                                include_date=False):
    """Write the given document text to output/<prefix>/coverage_highlight.txt.

    The text is written verbatim; any highlighting is expected to be
    embedded in document_text already.
    """
    print("> Writing document text with coverage highlights ...")

    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)

    output_txtfile = (outdir + "coverage_highlight" + ".txt")
    with open(output_txtfile, 'w') as txt_out:
        txt_out.write(document_text)
Example #9
0
def write_cluster_coverage_as_csv(coverage_data,
                                  outpath_prefix,
                                  include_date=False):
    """Write one 'Coverage' value per row into cluster_coverage.csv."""
    print("> Writing cluster coverage as csv ...")

    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)

    output_csvfile = outdir + "cluster_coverage.csv"
    with open(output_csvfile, 'w') as coverage_file:
        csvwriter = csv.writer(coverage_file)
        csvwriter.writerow(['Coverage'])
        # each coverage value becomes its own single-column row
        csvwriter.writerows([value] for value in coverage_data)
Example #10
0
def save_plotdata_csv(plotdata,
                      xlabel,
                      ylabel,
                      outpath_prefix,
                      include_date=False):
    """Save x/y plot data as a two-column csv named after the axis labels.

    plotdata is a dict with parallel sequences under 'x' and 'y'.
    """
    xdata = plotdata.get('x')
    ydata = plotdata.get('y')

    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)

    output_csvfile = (outdir + "plotdata_" + xlabel + "-" + ylabel + ".csv")
    with open(output_csvfile, 'w') as coverage_file:
        csvwriter = csv.writer(coverage_file)
        csvwriter.writerow([xlabel, ylabel])
        # iterate by x index so a short ydata still fails loudly,
        # exactly as the original indexing did
        for i, x_value in enumerate(xdata):
            csvwriter.writerow([x_value, ydata[i]])
Example #11
0
def write_csv_total_fragments_per_author_per_year(plotdata,
                                                  outpath_prefix,
                                                  include_date=True):
    """Write per-decade fragment counts with one column per top author.

    Every entry of plotdata shares the same 'decades' list; each entry
    contributes one 'decade_fragments' column named after its author.
    """
    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)
    topx = len(plotdata)
    outputfile = (outdir + "fragments_per_author_per_decade_top" + str(topx) +
                  ".csv")
    with open(outputfile, 'w') as output_csv:
        csvwriter = csv.writer(output_csv)
        author_names = [entry.get('author') for entry in plotdata]
        csvwriter.writerow(["decade"] + author_names)
        decades = plotdata[0].get('decades')
        for i, decade in enumerate(decades):
            datarow = [decade]
            datarow.extend(
                entry.get('decade_fragments')[i] for entry in plotdata)
            csvwriter.writerow(datarow)
Example #12
0
def get_default_logfile(logdir="./logs"):
    """Return the default log file path, creating logdir if it is missing."""
    create_dir_if_not_exists(logdir)
    return logdir + "/log.txt"
Example #13
0
        opts, args = getopt.getopt(argv, "", ["inputfile="])
    except getopt.GetoptError:

        sys.exit(2)
    for opt, arg in opts:
        if opt == "--inputfile":
            inputfile = arg
    print("inputfile: " + inputfile)
    return inputfile


# read input parameters
inputfile = get_start_params(sys.argv[1:])
# NOTE(review): [:-4] presumably strips a 4-character extension such as
# ".csv" — confirm against the filenames in cfg/indexdata_dump/
outputprefix = inputfile[:-4]
outputpath = '../../output/work/octavo_indices/' + outputprefix + '/'
create_dir_if_not_exists(outputpath)

# setup api clients
ecco_api_client = OctavoEccoClient()
# cluster client gets an explicit 60 second timeout
cluster_api_client = OctavoEccoClusterClient(timeout=60)

# field lists requested from the ECCO / ECCO-cluster APIs
fields_ecco = ["documentID", "content"]
field_eccocluster = [
    "documentID", "fragmentID", "text", "startIndex", "endIndex"
]

# read ids to process
ids_to_process = set()  # document ids collected from the input csv
add_csv_ids_to_set('cfg/indexdata_dump/' + inputfile, ids_to_process)

# prepare log file and list of processed ids
Example #14
0
# docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv')
# location of the raw ECCO xml/image source data
xml_img_page_datadir = ("../data/raw/ecco-xml-img/")

# field lists requested from the ECCO / ECCO-cluster APIs
fields_ecco = ["documentID", "content"]
field_eccocluster = [
    "documentID", "fragmentID", "text", "startIndex", "endIndex"
]

datadir = get_datadir() + "/"

# reuse data list of JSON files
reuse_data_dir = "../output/" + datadir
add_cludata = read_reuse_data_dir(reuse_data_dir)
# pdf output and temporary working images go under per-datadir paths
outdir = "../output/pdfs/" + datadir
create_dir_if_not_exists(outdir)
temp_workdir = "../data/work/img/" + datadir
create_dir_if_not_exists(temp_workdir)

# Hume, History of England
# NOTE(review): only two volume ids are active; the commented-out ones
# look already processed — confirm before re-running the batch
docids_first = [
    # "1729100401",
    # "1729200102",
    # "1729200103",
    # "1729300104",
    # "1729400105",
    "1729400106",
    "1729500107",
    # "1729500108",
]
Example #15
0
def create_csv_summaries(outputpath, documents_meta_dict):
    """Aggregate the per-volume csv outputs under outputpath into summaries.

    Reads the csv files previously written into outputpath's
    subdirectories and produces volume_totals.csv, author_totals.csv,
    reuses_by_author_all_headers.csv and politics_summary.csv.
    """
    # def get_filenamesums(filename):
    create_dir_if_not_exists(outputpath)
    # one subdirectory per processed volume
    current_subdirs = glob(outputpath + "*/")
    filenames = []

    for pathpart in current_subdirs:
        filenames.append(pathpart + "header_plotdata.csv")

    outfilename = outputpath + "volume_totals.csv"

    # write only the header row here; add_filename_to_totals appends the
    # data rows per volume below
    with open(outfilename, 'w') as csvoutfile:
        csvwriter = csv.writer(csvoutfile)
        csvwriter.writerow(
            ['volume', 'fragments', 'length', 'description', 'sequence'])

    for filename in filenames:
        # NOTE(review): assumes the volume id is the third '/'-separated
        # path component — confirm against the actual outputpath layout
        book_index = filename.split('/')[2]
        add_filename_to_totals(outfilename, filename, book_index,
                               documents_meta_dict)

    # get author totals
    author_metadata = read_author_metadata_csv(
        "../data-public/authors-metadata/misc/author_metadata.csv")
    author_filenames = []
    for pathpart in current_subdirs:
        author_filenames.append(pathpart + "fragments_per_author.csv")

    outfilename = outputpath + "author_totals.csv"

    author_totals = {}

    for filename in author_filenames:
        add_to_author_totals(author_totals, filename)

    with open(outfilename, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['author', 'fragments', 'political_views', 'link'])
        for key, value in author_totals.items():
            # authors missing from the metadata get "no_record" placeholders
            if key in author_metadata.keys():
                author_politics = author_metadata.get(key).get(
                    'political_views')
                author_link = author_metadata.get(key).get('link')
            else:
                author_politics = "no_record"
                author_link = "no_record"
            csvwriter.writerow([key, value, author_politics, author_link])

    # per-header cluster csvs sit two directory levels below outputpath
    header_filenames = glob(outputpath + "*/*/*.csv")
    header_author_totals_outfile = (outputpath +
                                    "reuses_by_author_all_headers.csv")

    # header row only; data rows are appended per header file below
    with open(header_author_totals_outfile, 'w') as csvout:
        csvwriter = csv.writer(csvout)
        csvwriter.writerow(
            ['author', 'eccoid', 'header', 'header_index', 'fragments'])

    for filename in header_filenames:
        bookid = filename.split('/')[2]

        with open(filename, 'r') as csvfile:
            csvreader = csv.DictReader(csvfile)
            # next(csvreader, None)  # skip the headers
            author_totals = {}
            for row in csvreader:
                author = row.get('author')
                # title = row.get('title')
                # NOTE(review): group_name/group_id keep whatever the LAST
                # row held — apparently each file covers a single group. An
                # empty csv would leave them undefined (NameError below);
                # confirm the per-header files are never empty.
                group_name = row.get('group_name')
                group_id = row.get('group_id')
                if author not in author_totals.keys():
                    author_totals[author] = 1
                else:
                    author_totals[author] += 1

            with open(header_author_totals_outfile, 'a') as csvout:
                csvwriter = csv.writer(csvout)
                for key, value in author_totals.items():
                    csvwriter.writerow(
                        [key, bookid, group_name, group_id, value])

    # sum political views
    pol_filenames = []
    for pathpart in current_subdirs:
        pol_filenames.append(pathpart + "plotdata_politics_sum.csv")

    outfilename = outputpath + "politics_summary.csv"

    with open(outfilename, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([
            'volume', 'content', 'sequence', 'whig', 'royalist', 'jacobite',
            'parliamentarian', 'tory', 'unionist', 'no_record', 'whig_wide',
            'tory_wide', 'others_wide'
        ])

        # copy each volume's politics summary row, prefixed with the
        # volume id and its metadata
        for filename in pol_filenames:
            bookid = filename.split('/')[2]
            book_meta = documents_meta_dict[bookid]
            with open(filename, 'r') as readfile:
                csvreader = csv.DictReader(readfile)
                for row in csvreader:
                    csvwriter.writerow([
                        bookid,
                        book_meta.get('description'),
                        book_meta.get('sequence'),
                        row.get('whig'),
                        row.get('royalist'),
                        row.get('jacobite'),
                        row.get('parliamentarian'),
                        row.get('tory'),
                        row.get('unionist'),
                        row.get('no_record'),
                        row.get('whig_wide'),
                        row.get('tory_wide'),
                        row.get('others_wide')
                    ])
Example #16
0
def write_document_html_with_coverage_highlight(coverage_data,
                                                document_text,
                                                outpath_prefix,
                                                include_date=False):
    """Render document_text as HTML, wrapping cluster-covered spans in <mark>.

    coverage_data is indexed per character of document_text: 0 means the
    character is outside every reuse cluster, non-zero means inside.
    "\\n\\n#" opens a header paragraph (<h1>), "\\n\\n" is a paragraph
    break, and a single "\\n" becomes a space. The result is written to
    output/<prefix>/coverage_highlight.html.
    """
    # cgi.escape was removed in Python 3.8; html.escape(quote=False) is
    # its exact replacement (escapes &, <, > but not quotes)
    import html

    print("> Writing document html with coverage highlights ...")
    document_text_list = list(document_text)
    document_length = len(document_text_list)
    html_text_list = []

    html_text_list.append(get_html_start(add_header=True))
    html_text_list.append("<p>")

    char_index = 0
    prev_char_in_cluster = False
    mark_tag_open = False
    header_tag_open = False
    mark_tag_reopen = False

    while char_index < document_length:
        this_char_in_cluster = coverage_data[char_index] != 0

        # open/close <mark> on coverage transitions
        if this_char_in_cluster and not prev_char_in_cluster:
            html_text_list.append("<mark>")
            mark_tag_open = True
        elif not this_char_in_cluster and prev_char_in_cluster:
            html_text_list.append("</mark>")
            mark_tag_open = False

        # bounded lookahead; the original indexed char_index + 1 / + 2
        # unguarded and could IndexError on a trailing newline
        next_char = (document_text_list[char_index + 1]
                     if char_index + 1 < document_length else "")
        next_next_char = (document_text_list[char_index + 2]
                          if char_index + 2 < document_length else "")

        if (document_text_list[char_index] == "\n" and next_char == "\n"
                and next_next_char == "#"):
            # "\n\n#" -> new paragraph that is a header
            header_tag_open = True
            if mark_tag_open:
                html_text_list.append("</mark></p>\n<p><h1>")
                mark_tag_reopen = True
            else:
                html_text_list.append("</p>\n<p><h1>")
            char_to_append = ""
            char_index_step = 3
        elif document_text_list[char_index] == "\n" and next_char == "\n":
            # "\n\n" -> paragraph break; also closes an open header
            if mark_tag_open:
                html_text_list.append("</mark>")
                mark_tag_reopen = True
            if header_tag_open:
                html_text_list.append("</h1>")
                header_tag_open = False
            html_text_list.append("</p>")
            char_to_append = "\n<p>"
            char_index_step = 2
        elif document_text_list[char_index] == "\n":
            char_to_append = " "
            char_index_step = 1
        else:
            char_to_append = html.escape(document_text_list[char_index],
                                         quote=False)
            char_index_step = 1

        html_text_list.append(char_to_append)

        if mark_tag_reopen:
            html_text_list.append("<mark>")
            mark_tag_reopen = False
        prev_char_in_cluster = this_char_in_cluster
        char_index += char_index_step

    if mark_tag_open:
        html_text_list.append("</mark>")
    html_text_list.append("</p>")
    html_text_list.append(get_html_end())
    html_text_results = ''.join(html_text_list)

    if include_date:
        outpath_prefix = get_outpath_prefix_with_date(outpath_prefix)
    outdir = "output/" + outpath_prefix + "/"
    create_dir_if_not_exists(outdir)

    output_htmlfile = (outdir + "coverage_highlight.html")
    with open(output_htmlfile, 'w') as output_file:
        output_file.write(html_text_results)