Example #1
import os

import spacy

import utils  # project-local helpers: read_jsonl, read_jsonl_gz, write_jsonl, gzip_file


def tokenize_dataset(root, spacy_model):
    nlp = spacy.load(spacy_model)

    for topic in sorted(os.listdir(root)):
        print('TOPIC:', topic)
        # Prefer the gzipped articles file; fall back to the plain JSONL file.
        if os.path.exists(root / topic / 'articles.jsonl.gz'):
            articles = list(utils.read_jsonl_gz(root / topic / 'articles.jsonl.gz'))
        elif os.path.exists(root / topic / 'articles.jsonl'):
            articles = list(utils.read_jsonl(root / topic / 'articles.jsonl'))
        else:
            continue

        jsonl_out_path = root / topic / 'articles.tokenized.jsonl'
        out_batch = []
        for i, a in enumerate(articles):
            # Rebuild the article text with one space-separated, tokenized
            # sentence per line.
            tokenized_doc = ''
            doc = nlp(a['text'])
            for sent in doc.sents:
                tokens = [tok.text for tok in sent if not tok.text.isspace()]
                tokenized_doc += ' '.join(tokens) + '\n'
            a['text'] = tokenized_doc.strip()
            out_batch.append(a)

            if i % 100 == 0:
                # Flush the batch to disk periodically (override=False appends).
                utils.write_jsonl(out_batch, jsonl_out_path, override=False)
                out_batch = []
                print(i)

        # Write the remaining articles, then gzip the result and delete the
        # uncompressed file.
        utils.write_jsonl(out_batch, jsonl_out_path, override=False)

        gz_out_path = root / topic / 'articles.tokenized.jsonl.gz'
        utils.gzip_file(jsonl_out_path, gz_out_path, delete_old=True)
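For context, a minimal invocation might look like the sketch below. The dataset root and the model name are illustrative assumptions; the function only requires a directory of topics, each containing articles.jsonl or articles.jsonl.gz, and a loadable spaCy pipeline that performs sentence segmentation.

from pathlib import Path

# Hypothetical dataset location and spaCy model name, for illustration only.
dataset_root = Path('data/news-dataset')
tokenize_dataset(dataset_root, 'en_core_web_sm')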
Example #2
def times(self):
    # Collect all article timestamps as timezone-naive datetimes.
    articles = utils.read_jsonl(self.path / 'articles.preprocessed.jsonl')
    times = []
    for a in articles:
        t = arrow.get(a['time']).datetime
        t = t.replace(tzinfo=None)  # drop the timezone for uniform comparisons
        times.append(t)
    return times
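The method leans on arrow's parsing behavior: arrow.get accepts ISO-8601 strings, and its .datetime property returns a timezone-aware datetime, which the method then strips to a naive one. A quick standalone sketch with an invented timestamp:

import arrow

# arrow.get parses ISO-8601 strings into timezone-aware datetimes;
# replace(tzinfo=None) makes them naive, as in the method above.
t = arrow.get('2011-05-01T14:30:00+00:00').datetime
print(t)                       # 2011-05-01 14:30:00+00:00
print(t.replace(tzinfo=None))  # 2011-05-01 14:30:00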
Example #3
def _load_timelines(self):
    timelines = []
    for raw_tl in utils.read_jsonl(self.path / 'timelines.jsonl'):
        if raw_tl:  # skip empty timelines
            tl_items = []
            for t, s in raw_tl:  # each entry is a (time, summary) pair
                t = self.normalise_time(arrow.get(t))
                tl_items.append((t, s))
            tl = Timeline(tl_items)
            timelines.append(tl)
    return timelines
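The unpacking above implies that each line of timelines.jsonl holds a JSON array of (time, summary) pairs. A sketch of what one such line could look like; the dates and summary strings are invented for illustration:

import json

# Plausible shape of one line in timelines.jsonl, inferred from the loader;
# the concrete values are illustrative, not taken from any real dataset.
line = '[["2011-05-01", "Summary of the first event."], ["2011-05-02", "Summary of the second event."]]'
for t, s in json.loads(line):
    print(t, '->', s)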
Example #4
def _load_timelines(self):
    timelines = []
    path = self.path / "timelines.jsonl"
    if not path.exists():  # tolerate topics that ship without timelines
        return []
    for raw_tl in utils.read_jsonl(path):
        if raw_tl:  # skip empty timelines
            tl_items = []
            for t, s in raw_tl:  # each entry is a (time, summary) pair
                t = self.normalise_time(arrow.get(t))
                tl_items.append((t, s))
            tl = Timeline(tl_items)
            timelines.append(tl)
    return timelines
Example #5
def articles(self):
    # Stream articles, preferring the uncompressed file when present.
    path1 = self.path / "articles.preprocessed.jsonl"
    path2 = self.path / "articles.preprocessed.jsonl.gz"
    if path1.exists():
        articles = utils.read_jsonl(path1)
    else:
        articles = utils.read_jsonl_gz(path2)
    for a_ in articles:
        a = load_article(a_)
        t = self.normalise_time(a.time)
        if self.start and t < self.start:
            continue
        if self.end and t > self.end:
            break  # stop early: assumes articles arrive sorted by time
        yield a
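Note the asymmetry in the window check: articles before self.start are skipped one by one, but the first article past self.end stops iteration entirely, which is only correct if the stream is sorted by time. A usage sketch; the collection object and the date window are illustrative stand-ins:

from datetime import datetime

# collection stands in for whatever object defines articles(),
# normalise_time, start and end; the window below is invented.
collection.start = datetime(2011, 1, 1)
collection.end = datetime(2011, 12, 31)
for a in collection.articles():
    print(a.time)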
Example #6
def _load_timelines(self):
    timelines = []
    path = self.path / 'timelines.jsonl'
    with open("data_log.txt", "a") as ftmp:  # ad-hoc debug trace of the input path
        print(path, file=ftmp)
    if not path.exists():
        return []
    for raw_tl in utils.read_jsonl(path):
        if raw_tl:
            with open("data_log.txt", "a") as ftmp:  # trace each raw timeline
                print(raw_tl, file=ftmp)
            tl_items = []
            for t, s in raw_tl:
                t = self.normalise_time(arrow.get(t))
                tl_items.append((t, s))
            tl = Timeline(tl_items)
            timelines.append(tl)
    return timelines
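This variant differs from the previous loaders only in the debug writes to data_log.txt. Reopening the file for every record works but is costly; the standard logging module produces the same trace with one configuration call. A sketch of that alternative, not the original author's code; the file name and path are illustrative:

import logging
from pathlib import Path

# Alternative tracing via the standard library; names are illustrative only.
logging.basicConfig(filename='data_log.txt', level=logging.DEBUG)
path = Path('dataset/topic/timelines.jsonl')
logging.debug('loading timelines from %s', path)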