Esempio n. 1
0
 def add_metadata(self, good_metadata, author_metadata):
     """Fill this object's metadata attributes from the supplied lookups.

     If ``self.ecco_id`` is not present in ``good_metadata``, a notice is
     printed and no attribute is assigned. Otherwise location, country,
     author, ESTC id, year, title and the author's birth/death values
     are read through the ``*_from_estc`` helpers. The first-edition
     year guess falls back to ``__get_guessed_first_ed_year`` when the
     helper yields ``None``, and ``author_politics`` is derived from
     ``author_metadata``.
     """
     # Guard clause: nothing can be looked up for an unknown id.
     if self.ecco_id not in good_metadata:
         print(str(self.ecco_id) + " not in good metadata.")
         return

     ecco_id = self.ecco_id

     self.location = get_location_from_estc(ecco_id, good_metadata)
     self.country = get_country_from_estc(ecco_id, good_metadata)
     self.author = get_author_from_estc(ecco_id, good_metadata)
     self.estc_id = get_estcid_from_estc(ecco_id, good_metadata)
     self.year = get_year_from_estc(ecco_id, good_metadata)
     self.title = get_title_from_estc(ecco_id, good_metadata)

     life_dates = get_author_bd_from_estc(ecco_id, good_metadata)
     self.author_birth = life_dates.get('birth')
     self.author_death = life_dates.get('death')

     # The attribute is written before the fallback runs so the private
     # guess method sees the same instance state as it always has.
     self.first_ed_year_guess = get_first_ed_year_guess_from_estc(
         ecco_id, good_metadata)
     if self.first_ed_year_guess is None:
         self.first_ed_year_guess = self.__get_guessed_first_ed_year()

     self.author_politics = self.get_author_political_affiliation(
         author_metadata)
Esempio n. 2
0
def enrich_reuse_summary(reuse_summary, metadata):
    """Add ESTC-derived fields to every row of ``reuse_summary`` in place.

    Each row is indexed by ``'document_id'`` and receives the keys
    ``'title'``, ``'year'``, ``'author'``, ``'location'`` and ``'estcid'``,
    each filled by the matching ``get_*_from_estc`` helper. The rows are
    mutated; nothing is returned.
    """
    for entry in reuse_summary:
        doc_id = entry['document_id']
        entry['title'] = get_title_from_estc(doc_id, metadata)
        entry['year'] = get_year_from_estc(doc_id, metadata)
        entry['author'] = get_author_from_estc(doc_id, metadata)
        entry['location'] = get_location_from_estc(doc_id, metadata)
        entry['estcid'] = get_estcid_from_estc(doc_id, metadata)
Esempio n. 3
0
def get_cluster_data_for_document_id_from_api_filters(document_id,
                                                      good_metadata,
                                                      not_author=True,
                                                      originals_only=True,
                                                      years_min=-1000,
                                                      years_max=1000,
                                                      testing_amount=-1):
    """Fetch cluster data for a document and return the filtered items.

    The wide cluster data is requested through ``OctavoEccoClusterClient``
    (``testing_amount`` is passed as its ``limit``), enriched with
    ``enrich_cluster_data`` and then filtered:

    * ``not_author``: drop items whose author equals the source document's
      author, except items belonging to the source document itself.
    * ``originals_only``: keep only items whose ``cluster_id`` is among the
      ids returned by ``get_cluster_ids_where_book_id_first``.
    * ``years_max``: unless left at ``1000``, drop items whose year is not
      ``<=`` the source year plus ``years_max``.
    * ``years_min``: unless left at ``-1000``, drop items whose year is not
      ``>=`` the source year plus ``years_min``.

    The unfiltered and filtered lengths are printed. Returns the list of
    surviving enriched items, in their original order.
    """
    client = OctavoEccoClusterClient(limit=testing_amount)
    raw_clusters = client.get_wide_cluster_data_for_document_id(document_id)

    print("Orig data length:" + str(len(raw_clusters)))

    # From here on the id is compared against string ids in the enriched data.
    document_id = str(document_id)
    source_author = get_author_from_estc(document_id, good_metadata)
    source_year = get_year_from_estc(document_id, good_metadata)

    enriched_items = enrich_cluster_data(raw_clusters, good_metadata)
    first_cluster_ids = get_cluster_ids_where_book_id_first(
        enriched_items, document_id)

    def passes_filters(entry):
        # Checks run in a fixed order: author, originals, max year, min year.
        entry_author = entry.get('author')
        entry_year = entry.get('year')

        if (not_author
                and source_author == entry_author
                and entry.get('document_id') != document_id):
            return False

        if originals_only and entry.get('cluster_id') not in first_cluster_ids:
            return False

        # 1000 / -1000 are the "no limit" sentinels of the signature.
        if years_max != 1000 and not (
                entry_year <= (source_year + years_max)):
            return False

        if years_min != -1000 and not (
                entry_year >= (source_year + years_min)):
            return False

        return True

    kept_items = [entry for entry in enriched_items if passes_filters(entry)]

    print("Filtered data length:" + str(len(kept_items)))

    return kept_items
Esempio n. 4
0
def enrich_cluster_data(cluster_data, good_metadata):
    """Build a list of enriched dicts, one per item of ``cluster_data``.

    For each item, ``'documentID'`` is stringified and used to look up
    author, year and ESTC id via the ``get_*_from_estc`` helpers. The
    resulting dict carries ``document_id``, ``cluster_id`` (the stringified
    ``'fragmentID'``), ``title``, ``author``, ``year``, ``startIndex``,
    ``endIndex``, ``text`` and ``estc_id``. The number of produced items is
    printed before the list is returned.
    """
    def to_enriched(raw):
        # Lookups are keyed on the string form of the document id.
        doc_id = str(raw.get('documentID'))
        doc_author = get_author_from_estc(doc_id, good_metadata)
        doc_year = get_year_from_estc(doc_id, good_metadata)
        doc_estcid = get_estcid_from_estc(doc_id, good_metadata)
        return {
            'document_id': doc_id,
            'cluster_id': str(raw.get('fragmentID')),
            'title': raw.get('title'),
            'author': doc_author,
            'year': doc_year,
            'startIndex': raw.get('startIndex'),
            'endIndex': raw.get('endIndex'),
            'text': raw.get('text'),
            'estc_id': doc_estcid,
        }

    enriched = [to_enriched(raw) for raw in cluster_data]

    print("return data length:" + str(len(enriched)))
    return enriched