Exemple #1
0
    def test_cacm(self):
        """Download the CACM tarball, parse every document and check the total.

        The archive is fetched from the Anserini repository, extracted into a
        scratch directory and read through an ``HtmlCollection`` with a
        ``JsoupGenerator``. For each parsed document the ``id``, ``raw`` and
        ``contents`` fields must not be the empty string, and exactly 3204
        documents must be seen.

        The downloaded tarball and the extracted directory are removed in a
        ``finally`` clause so that a failed download, parse error or failed
        assertion does not leave files behind.
        """
        # We're going to append a random value to downloaded files:
        r = randint(0, 10000000)
        collection_url = 'https://github.com/castorini/anserini/blob/master/src/main/resources/cacm/cacm.tar.gz?raw=true'
        tarball_name = 'cacm{}.tar.gz'.format(r)
        collection_dir = 'collection{}/'.format(r)

        try:
            urlretrieve(collection_url, tarball_name)

            # The context manager closes the archive even if extraction fails.
            with tarfile.open(tarball_name) as tarball:
                tarball.extractall(collection_dir)

            collection = pycollection.Collection('HtmlCollection', collection_dir)
            generator = pygenerator.Generator('JsoupGenerator')

            cnt = 0
            for fs in collection:
                for doc in fs:
                    parsed = generator.create_document(doc)
                    self.assertNotEqual(parsed.get('id'), '')  # FIELD_ID
                    self.assertNotEqual(parsed.get('raw'), '')  # FIELD_RAW
                    self.assertNotEqual(parsed.get('contents'), '')  # FIELD_BODY
                    cnt += 1

            self.assertEqual(cnt, 3204)
        finally:
            # Clean up. Either artifact may be missing if an earlier step
            # failed, so neither removal is allowed to mask the real error.
            if os.path.exists(tarball_name):
                os.remove(tarball_name)
            shutil.rmtree(collection_dir, ignore_errors=True)
Exemple #2
0
def __documents_in_collection(config):
    """Lazily yield the generated form of every parsable document.

    ``config`` is indexed with the keys ``'pyserini_collection'``,
    ``'collection_directory'`` and ``'pyserini_generator'``; the first two
    build the collection, the third builds the generator.

    Documents rejected by ``__document_is_parsable`` are skipped, and so are
    documents for which ``create_document`` returns ``None``.
    """
    collection_class = config['pyserini_collection']
    collection_path = config['collection_directory']
    collection = pycollection.Collection(collection_class, collection_path)
    generator = pygenerator.Generator(config['pyserini_generator'])

    for segment in collection:
        for raw_document in segment:
            if __document_is_parsable(raw_document):
                created = generator.create_document(raw_document)
                if created is not None:
                    yield created
def spawn_child_process_to_read_docs(data):
    """Read the requested documents from one collection into a shared dict.

    ``data`` is indexed with four keys: ``"doc_ids"`` (passed through to
    ``read_file_segment``), ``"rootdir"`` (collection path), ``"ctype"``
    (collection class name) and ``"shared_dict"`` (mapping that receives the
    results).

    Every file segment of the collection is handed to ``read_file_segment``
    together with a ``JsoupGenerator``. The id/content pairs it returns are
    gathered locally and pushed into ``shared_dict`` with a single
    ``update`` call at the end. A timing line is printed when done.

    NOTE(review): the name suggests this runs in a worker process and that
    ``shared_dict`` is a cross-process mapping -- confirm against the caller.
    """
    wanted_ids = data["doc_ids"]
    path = data["rootdir"]
    collection_type = data["ctype"]
    shared = data["shared_dict"]
    collected = {}
    # The clock starts before the pyserini imports, so import time is
    # included in the reported duration.
    started = time.time()
    from pyserini.collection import pycollection
    from pyserini.index import pygenerator

    collection = pycollection.Collection(collection_type, path)
    generator = pygenerator.Generator("JsoupGenerator")
    for segment in collection:
        ids, contents = read_file_segment(wanted_ids, segment, generator)
        # Positional lookup pairs the n-th id with the n-th content entry.
        collected.update(
            (doc_id, contents[position])
            for position, doc_id in enumerate(ids))
    shared.update(collected)

    message = (
        "PID: {0}, Done getting documents from disk: {1} for path: {2}".format(
            os.getpid(),
            time.time() - started, path))
    print(message)
Exemple #4
0
    def __iter__(self):
        """Yield ``(id, title, contents)`` for each document that can be parsed.

        The collection path, collection type and generator type come from
        ``self.get_path_and_types()``. Documents whose parsing raises, or for
        which the generator returns a falsy value, are skipped.
        """
        # Imported here rather than at module level, as in the original.
        from pyserini.collection import pycollection
        from pyserini.index import pygenerator

        path, ctype, gtype = self.get_path_and_types()
        # TODO change on pyserini upgrade
        if gtype == "WashingtonPostGenerator":
            gtype = "WapoGenerator"

        collection = pycollection.Collection(ctype, path)
        generator = pygenerator.Generator(gtype)

        for fs in collection:
            for doc in fs:
                try:
                    parsed = generator.create_document(doc)
                except Exception:
                    # Best effort: a document the generator cannot handle is
                    # skipped. Catching Exception (not a bare except) lets
                    # KeyboardInterrupt and SystemExit still propagate.
                    continue

                if parsed:
                    yield (parsed.get("id"), parsed.get("title"), parsed.get("contents"))
Exemple #5
0
from pyserini.collection import pycollection

# Count the articles in the first batch returned by the collection and tally
# how many of them report having full text.
collection = pycollection.Collection(
    'Cord19AbstractCollection',
    # Raw string: in an ordinary literal "\U" starts a Unicode escape (a
    # SyntaxError for this path) and "\n" would be read as a newline.
    r'C:\Users\noekn\Documents\Information_Retrieval')

cnt = 0
full_text = {True: 0, False: 0}

# NOTE(review): presumably next() returns the first file segment of the
# collection -- confirm that the installed pyserini version exposes it.
articles = collection.next()
for d in articles:
    article = pycollection.Cord19Article(d.raw)
    cnt += 1
    full_text[article.is_full_text()] += 1
    # Progress line every 1000 articles.
    if cnt % 1000 == 0:
        print(f'{cnt} articles read...')
Exemple #6
0
def IterCollection(input_path,
                   collection_class,
                   generator_class,
                   output_path,
                   threads=1,
                   tokenize=None,
                   tokenmin=0,
                   raw=False):
    """
    Read a collection and process each file segment on a thread pool.

    Every file segment of the collection is submitted to ``IterSegment``
    together with the generator, the output path, the optional tokenizer,
    ``tokenmin`` and ``raw``. After all tasks have finished, the collection
    counters and the total duration are logged. A segment whose task raised
    is reported through ``logger.error``; the exception is not re-raised.

    Parameters
    ----------
    input_path : str
        Path to input collection.

    collection_class : str
        Anserini collection class to use.

    generator_class : str
        Anserini generator class to use.

    output_path : str
        Path to output resulting json collection. Created (including
        missing parent directories) when it does not exist.

    threads : int
        Maximum number of threads.

    tokenize : str
        Name of tokenizer method to use for document-to-passage splitting.

    tokenmin : int
        Minimum limit argument (e.g. minword) passed to tokenizer

    raw : bool
        True for using raw (FIELD_RAW) document contents,
        False for using parsed (FIELD_BODY) document contents.

    Raises
    ------
    ValueError
        If ``DocumentTokenizer(tokenize)`` fails; the original error is
        chained as the cause.

    """
    start = time.time()
    logger.info("Begin reading collection.")

    ## Check and create tokenizer
    tokenizer = None
    if tokenize is not None:
        try:
            tokenizer = DocumentTokenizer(tokenize)
        except Exception as err:
            # Keep the ValueError callers expect, but chain the real cause
            # instead of discarding it.
            raise ValueError(tokenize) from err

    collection = pycollection.Collection(collection_class, input_path)
    generator = pygenerator.Generator(generator_class)

    if not os.path.exists(output_path):
        logger.info("making directory...")
        # exist_ok covers the directory appearing between the check and the
        # call; makedirs also creates missing parent directories.
        os.makedirs(output_path, exist_ok=True)

    submitted = []
    with ThreadPoolExecutor(max_workers=threads) as executor:
        for (seg_num, fs) in enumerate(collection):
            future = executor.submit(IterSegment, fs, generator, output_path,
                                     tokenizer, tokenmin, raw)
            submitted.append((seg_num, future))
    # Leaving the ``with`` block waits for every submitted task to finish.

    end = time.time()
    elapsed = end - start

    print("all threads complete")

    # Exceptions raised inside a worker are stored on its future; report
    # them here so that a failed segment does not go unnoticed.
    for seg_num, future in submitted:
        error = future.exception()
        if error is not None:
            logger.error("segment %d failed: %r", seg_num, error)

    logger.info("# Final Counter Values")
    logger.info("indexable:     {:12d}".format(
        collection.counters.indexable.value))
    logger.info("unindexable: {:12d}".format(
        collection.counters.unindexable.value))
    logger.info("skipped:     {:12d}".format(
        collection.counters.skipped.value))
    logger.info("errors:      {:12d}".format(collection.counters.errors.value))

    logger.info("Total duration: %s", str(datetime.timedelta(seconds=elapsed)))