Example 1
import os

import spacy

import utils  # project-local helpers for JSONL and gzip I/O


def tokenize_dataset(root, spacy_model):
    # Load a spaCy pipeline; the sentence iteration below relies on a
    # component that sets sentence boundaries (e.g. the parser).
    nlp = spacy.load(spacy_model)

    for topic in sorted(os.listdir(root)):
        print('TOPIC:', topic)
        if os.path.exists(root / topic / 'articles.jsonl.gz'):
            articles = list(utils.read_jsonl_gz(root / topic / 'articles.jsonl.gz'))
        elif os.path.exists(root / topic / 'articles.jsonl'):
            articles = list(utils.read_jsonl(root / topic / 'articles.jsonl'))
        else:
            continue

        jsonl_out_path = root / topic / 'articles.tokenized.jsonl'
        out_batch = []
        for i, a in enumerate(articles):
            # Rebuild the article text as one whitespace-tokenized
            # sentence per line.
            tokenized_doc = ''
            doc = nlp(a['text'])
            for sent in doc.sents:
                tokens = [tok.text for tok in sent if not tok.text.isspace()]
                tokenized_doc += ' '.join(tokens) + '\n'
            a['text'] = tokenized_doc.strip()
            out_batch.append(a)

            # Flush every 100 articles; note override=False appends, so a
            # stale output file from an earlier run is not truncated.
            if i % 100 == 0:
                utils.write_jsonl(out_batch, jsonl_out_path, override=False)
                out_batch = []
                print(i)

        # Write the final partial batch, then compress the result.
        utils.write_jsonl(out_batch, jsonl_out_path, override=False)

        gz_out_path = root / topic / 'articles.tokenized.jsonl.gz'
        utils.gzip_file(jsonl_out_path, gz_out_path, delete_old=True)
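For reference, a minimal invocation of this example might look like the sketch below; the dataset root and the spaCy model name are assumptions, not values from the source.

from pathlib import Path

# Hypothetical dataset root and model name, for illustration only.
tokenize_dataset(Path('data/dataset'), 'en_core_web_sm')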
Example 2
def preprocess_dataset(root, nlp):
    # Debug log of the raw inputs; closed at the end of the run.
    ftmp = open("log.txt", "w")

    article_path = root / 'articles.tokenized.jsonl.gz'
    articles = utils.read_jsonl_gz(article_path)
    h_output_dir = root / 'demo_time_annotated'
    out_path = root / 'demo_articles.preprocessed.jsonl'
    out_batch = []
    i = 0

    for old_a, timeml_raw in read_articles(articles, h_output_dir):
        print("old_a: {}\ntimeml_raw: {}\n".format(old_a, timeml_raw), file=ftmp)
        a = preprocess_article(old_a, timeml_raw, nlp)

        if a:
            out_batch.append(a.to_dict())
        else:
            date = arrow.get(old_a['time']).date()
            print('cannot process:', date, old_a['id'])

        if i % 100 == 0:
            print('writing batch,', i, 'articles done')
            # The first write truncates any stale output; later writes append.
            if i == 0:
                utils.write_jsonl(out_batch, out_path, override=True)
            else:
                utils.write_jsonl(out_batch, out_path, override=False)
            out_batch = []
        i += 1

    # Flush the final partial batch, compress, and close the debug log.
    utils.write_jsonl(out_batch, out_path, override=False)
    gz_path = str(out_path) + '.gz'
    utils.gzip_file(inpath=out_path, outpath=gz_path, delete_old=True)
    ftmp.close()
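The write_jsonl and gzip_file helpers are project-local. The sketch below shows their assumed behavior using only the standard library; the semantics are inferred from the calls above, not taken from the source.

import gzip
import json
import os
import shutil


def write_jsonl(items, path, override=False):
    # Assumed semantics: truncate when override=True, otherwise append.
    mode = 'w' if override else 'a'
    with open(path, mode, encoding='utf-8') as f:
        for item in items:
            f.write(json.dumps(item) + '\n')


def gzip_file(inpath, outpath, delete_old=False):
    # Compress inpath into outpath, optionally deleting the original.
    with open(inpath, 'rb') as fin, gzip.open(outpath, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
    if delete_old:
        os.remove(inpath)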
Example 3
def preprocess_dataset(root, nlp):

    for topic in sorted(os.listdir(root)):
        print("TOPIC:", topic)

        article_path = root / topic / "articles.tokenized.jsonl.gz"
        articles = utils.read_jsonl_gz(article_path)
        h_output_dir = root / topic / "time_annotated"
        out_path = root / topic / "articles.preprocessed.jsonl"
        out_batch = []
        i = 0

        for old_a, timeml_raw in read_articles(articles, h_output_dir):
            a = preprocess_article(old_a, timeml_raw, nlp)

            if a:
                out_batch.append(a.to_dict())
            else:
                date = arrow.get(old_a["time"]).date()
                print("cannot process:", date, old_a["id"])

            if i % 100 == 0:
                print("writing batch,", i, "articles done")
                if i == 0:
                    utils.write_jsonl(out_batch, out_path, override=True)
                else:
                    utils.write_jsonl(out_batch, out_path, override=False)
                out_batch = []
            i += 1

        utils.write_jsonl(out_batch, out_path, override=False)
        gz_path = str(out_path) + ".gz"
        utils.gzip_file(inpath=out_path, outpath=gz_path, delete_old=True)
Example 4
def preprocess_dataset(root, nlp):
    for topic in sorted(os.listdir(root)):
        # Skip macOS metadata entries before printing the topic name.
        if topic == '.DS_Store':
            continue
        print('TOPIC:', topic)

        article_path = root / topic / 'articles.tokenized.jsonl.gz'
        articles = utils.read_jsonl_gz(article_path)
        h_output_dir = root / topic / 'time_annotated'
        out_path = root / topic / 'articles.preprocessed.jsonl'
        out_batch = []
        i = 0

        for old_a, timeml_raw in read_articles(articles, h_output_dir):
            a = preprocess_article(old_a, timeml_raw, nlp)

            if a:
                out_batch.append(a.to_dict())
            else:
                date = arrow.get(old_a['time']).date()
                print('cannot process:', date, old_a['id'])

            if i % 100 == 0:
                print('writing batch,', i, 'articles done')
                if i == 0:
                    utils.write_jsonl(out_batch, out_path, override=True)
                else:
                    utils.write_jsonl(out_batch, out_path, override=False)
                out_batch = []
            i += 1

        utils.write_jsonl(out_batch, out_path, override=False)
        gz_path = str(out_path) + '.gz'
        utils.gzip_file(inpath=out_path, outpath=gz_path, delete_old=False)  # keep the uncompressed copy too
Example 5
    def articles(self):
        # Prefer the uncompressed file if it exists.
        path1 = self.path / "articles.preprocessed.jsonl"
        path2 = self.path / "articles.preprocessed.jsonl.gz"
        if path1.exists():
            articles = utils.read_jsonl(path1)
        else:
            articles = utils.read_jsonl_gz(path2)
        for a_ in articles:
            a = load_article(a_)
            t = self.normalise_time(a.time)
            if self.start and t < self.start:
                continue
            # Articles are assumed sorted by time, so we can stop at the
            # first one past the end of the window.
            if self.end and t > self.end:
                break
            yield a
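Iterating the generator over a date window might look like the sketch below; the enclosing class, its constructor, and the date bounds are assumptions, since only the method is shown.

import datetime
from pathlib import Path

# Hypothetical enclosing class and arguments, for illustration only.
collection = ArticleCollection(path=Path('data/topic'),
                               start=datetime.date(2011, 1, 1),
                               end=datetime.date(2011, 12, 31))
for article in collection.articles():
    print(article.time)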
Example 6
    def time_batches(self):
        articles = utils.read_jsonl_gz(self.path /
                                       'articles.preprocessed.jsonl.gz')
        time = None
        batch = []
        for a_ in articles:
            a = load_article(a_)
            a_time = self.normalise_time(a.time)

            if self.start and a_time < self.start:
                continue

            # Articles are assumed sorted by time.
            if self.end and a_time > self.end:
                break

            # A later timestamp closes the current batch.
            if time and a_time > time:
                yield time, batch
                time = a_time
                batch = [a]
            else:
                batch.append(a)
                time = a_time
        # Emit the final batch, if any articles fell in the window.
        if batch:
            yield time, batch
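Functionally, time_batches groups a time-sorted stream by normalised timestamp. A minimal equivalent using itertools.groupby is sketched below under the same sorted-input assumption; the start/end window filtering is omitted.

from itertools import groupby


def time_batches_equivalent(articles, normalise_time):
    # Assumes articles arrive sorted by time, as time_batches does.
    for t, group in groupby(articles, key=lambda a: normalise_time(a.time)):
        yield t, list(group)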
Example 7
def heideltime_preprocess(dataset_dir, heideltime_path):
    apply_heideltime = heideltime_path / 'apply-heideltime.jar'
    heideltime_config = heideltime_path / 'config.props'

    for topic in os.listdir(dataset_dir):
        print('TOPIC:', topic)

        # Materialise the generator so the articles can be iterated again
        # below when the input files are deleted.
        articles = list(utils.read_jsonl_gz(dataset_dir / topic / 'articles.tokenized.jsonl.gz'))

        # Dump each article as a plain-text file for HeidelTime to annotate.
        out_dir = dataset_dir / topic / 'time_annotated'
        utils.force_mkdir(out_dir)
        write_input_articles(articles, out_dir)

        # Run the HeidelTime wrapper over all .txt files in out_dir.
        subprocess.run([
            'java',
            '-jar',
            str(apply_heideltime),
            str(heideltime_config),
            str(out_dir),
            'txt'
        ])

        # Remove the plain-text inputs, keeping only the TimeML output.
        delete_input_articles(articles, out_dir)
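A minimal invocation sketch; the directory locations are assumptions, while apply-heideltime.jar and config.props are the files referenced in the code above.

from pathlib import Path

# Hypothetical locations, for illustration only.
heideltime_preprocess(Path('data/dataset'), Path('tools/heideltime'))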