Example #1
import pickle

import document_pb2  # generated protobuf module defining the Document message


def get_desc(doc_bytearray):
    doc = document_pb2.Document().FromString(pickle.loads(doc_bytearray))
    try:
        # 'doc_name: first sentence of the first content.'
        return '{}: {}.'.format(
            doc.doc_name, doc.document_contents[0].text.split(".")[0])
    except IndexError:
        # Document has no contents; fall back to the name alone.
        return '{}: .'.format(doc.doc_name)
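A minimal sketch of applying get_desc across a Spark DataFrame of preprocessed documents, assuming an active SparkSession named 'spark' and the 'doc_bytearray' binary column used in the later examples (the path is a placeholder):

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

get_desc_udf = udf(get_desc, StringType())
df = spark.read.parquet('data/pages.parquet')  # placeholder path
df.select(get_desc_udf('doc_bytearray').alias('desc')).show(5, truncate=False)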
Example #2
def build_qrels(spark,
                parquet_path,
                qrels_path,
                doc_count=1000,
                qrels_type='tree'):
    """ Build qrels (tree or hierarchical) from 'doc_count' number of preprocessed document data. """
    # Read processed df - each row in a TREC CAR Page and preprocessed protobuf message.
    df = spark.read.parquet(parquet_path)

    # Sample preprocessed data, oversampling slightly so the 'limit' below can
    # still return 'doc_count' rows.
    fraction = (doc_count / df.count()) + 0.001
    df_sample = df.sample(withReplacement=False, fraction=fraction)

    # Collect 'doc_count' rows and deserialize each into a Document message.
    doc_bytearray_list = df_sample.limit(doc_count).select(
        'doc_bytearray').collect()
    document_list = [
        document_pb2.Document().FromString(pickle.loads(doc_bytearray[0]))
        for doc_bytearray in doc_bytearray_list
    ]

    # Build qrels (tree or hierarchical).
    build_synthetic_qrels(document_list=document_list,
                          path=qrels_path,
                          qrels_type=qrels_type)
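A hedged usage sketch, assuming a local SparkSession and placeholder paths:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('build_qrels').getOrCreate()
build_qrels(spark,
            parquet_path='data/pages.parquet',    # placeholder path
            qrels_path='data/synthetic.qrels',    # placeholder path
            doc_count=1000,
            qrels_type='hierarchical')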
Example #3
def write_content_data_to_dir(spark,
                              read_path,
                              dir_path,
                              num_contents=1,
                              chunks=10000,
                              write_output=True):
    """ Create document content parquet DF by unpacking preprocessed document data."""
    # Create new dir to store data chunks
    if write_output and not os.path.isdir(dir_path):
        print('making dir: {}'.format(dir_path))
        os.mkdir(dir_path)

    # Read preprocessed document data.
    df = spark.read.parquet(read_path)
    n = int(df.select("index").rdd.max()[0])
    content_data = []
    chunk = 0
    t_start = time.time()
    # Write chunks of data to files.
    for i in range(0, n + 1, chunks):

        # Stop once 'num_contents' contents have been processed.
        if i >= num_contents:
            break

        # 'between' is inclusive on both ends, so subtract one to avoid
        # collecting boundary rows twice.
        for df_doc in df.where(df.index.between(i, i + chunks - 1)).collect():
            doc_id = df_doc[0]
            dataset = df_doc[1]
            doc = document_pb2.Document().FromString(pickle.loads(df_doc[3]))
            for doc_content in doc.document_contents:
                # Append the pickled, serialized DocumentContent message.
                content_data.append([
                    str(doc_content.content_id),
                    str(doc_content.content_type), doc_id, dataset,
                    bytearray(pickle.dumps(doc_content.SerializeToString()))
                ])

        if write_output:
            print('----- STEP {} -----'.format(i))
            time_delta = time.time() - t_start
            print('time elapsed: {} --> time / page: {}'.format(
                time_delta, time_delta / (i + 1)))
            write_to_parquet_content(data=content_data,
                                     dir_path=dir_path,
                                     chunk=chunk)

            # begin new list
            content_data = []
            chunk += 1

    if write_output and (len(content_data) > 0):
        print('WRITING FINAL FILE: {}'.format(i))
        write_to_parquet_content(data=content_data,
                                 dir_path=dir_path,
                                 chunk=chunk)

    time_delta = time.time() - t_start
    print('PROCESSED DATA: total time {} --> time / page: {}'.format(
        time_delta, time_delta / (i + 1)))
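Since each chunk lands as its own parquet file inside dir_path, Spark can read the whole directory back as one DataFrame. A minimal sketch, assuming the same 'spark' session and dir_path (the exact schema is set by write_to_parquet_content, which is not shown here):

content_df = spark.read.parquet(dir_path)
content_df.printSchema()
print('total contents: {}'.format(content_df.count()))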
Example #4
def get_top_ents(doc_bytearray):
    synthetic_entity_link_totals = document_pb2.Document().FromString(
        pickle.loads(doc_bytearray)).synthetic_entity_link_totals
    link_counts = []
    for synthetic_entity_link_total in synthetic_entity_link_totals:
        entity_id = str(synthetic_entity_link_total.entity_id)
        # Total link frequency across all anchor texts for this entity.
        count = sum(i.frequency
                    for i in synthetic_entity_link_total.anchor_text_frequencies)
        link_counts.append((entity_id, count))
    # Sort by count, descending, and keep the nine most-linked entity ids.
    return [i[0] for i in sorted(link_counts, key=lambda x: x[1], reverse=True)][:9]
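Because get_top_ents returns a list, registering it as a UDF needs an array return type. A sketch, again assuming a DataFrame with the 'doc_bytearray' column:

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

get_top_ents_udf = udf(get_top_ents, ArrayType(StringType()))
df = spark.read.parquet('data/pages.parquet')  # placeholder path
df.select(get_top_ents_udf('doc_bytearray').alias('top_ents')).show(5, truncate=False)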
Example #5
    def parse_article_to_protobuf(self, article):
        """ Build a Document message from an article dict with 'id', 'title' and 'text' keys. """

        # Initialise empty message.
        self.document = document_pb2.Document()
        self.document.doc_id = article['id']
        self.document.doc_name = article['title']

        document_content = document_pb2.DocumentContent()
        document_content.content_id = article['id']
        document_content.content_type = 1
        document_content.text = article['text']

        self.document.document_contents.append(document_content)

        self.__add_rel_entity_links()

        self.__add_entity_link_totals()

        return self.document
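A sketch of driving the parser, assuming the article dict keys used above; the enclosing class name here is hypothetical, since only the method is shown:

article = {'id': 'enwiki:Anarchism',  # placeholder article
           'title': 'Anarchism',
           'text': 'Anarchism is a political philosophy. It questions authority.'}
parser = WikiParser()  # hypothetical enclosing class
doc = parser.parse_article_to_protobuf(article)
print('{} ({} contents)'.format(doc.doc_name, len(doc.document_contents)))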
Example #6
def get_first_para(doc_bytearray):
    doc = document_pb2.Document().FromString(pickle.loads(doc_bytearray))
    try:
        # Text of the first content, typically the first paragraph.
        return str(doc.document_contents[0].text)
    except IndexError:
        # Document has no contents.
        return ""