Example #1
    def __load__data__(self, data):
        # Accepts either another Category instance or a raw document dict.
        if isinstance(data, Category):
            if hasattr(data, "_id"):
                self._id = data._id
            self.name = data.name
            if hasattr(data, "domain"):
                self.domain = data.domain
            self.patterns = data.patterns
            self.sentences = data.sentences
            self.sentence_ids = data.sentence_ids
        else:
            self._id = data["_id"]
            self.name = data["name"]

            self.domain = data.get("domain")
            if self.domain is None:
                self.domain = "none"
            for pat in data.get("patterns") or []:
                self.patterns.append(pat)
            self.sentence_ids = data.get("sentences") or []
            self.sentences = []
            for sentence_id in self.sentence_ids:
                sentence_data = Database.find_one(Database.COLLECTIONS.SENTENCE,
                                                  {"_id": sentence_id})
                s = Sentence()
                s.load_from_dict(sentence_data)
                self.sentences.append(s)
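
A usage sketch for the dictionary branch; the keys and the "none" default follow the accessors above, while the assumption that Category() can be constructed without arguments and already holds an empty patterns list is hypothetical:

category = Category()
category.__load__data__({
    "_id": "example-id-001",           # hypothetical document id
    "name": "greetings",
    "patterns": ["hello *", "hi there"],
    "sentences": [],                   # nothing to fetch from Database here
})
# category.domain == "none" because the "domain" key was absent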
Example #2
def read_sentence(line, position):
    infos = line.split(";;")
    nb_clause = int(infos[0])
    text = infos[-1].strip()
    propositions = [prop.split("\t") for prop in infos[1:-1]]

    return Sentence(propositions, text, nb_clause, position)
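
A usage sketch for read_sentence; the ";;"-separated layout (clause count, tab-separated propositions, then the sentence text) is inferred from the parsing above, and the concrete values are made up:

line = "2;;John\tloves\tMary;;Mary\tsmiles;;John loves Mary and Mary smiles.\n"
sentence = read_sentence(line, position=0)
# nb_clause == 2
# propositions == [["John", "loves", "Mary"], ["Mary", "smiles"]]
# text == "John loves Mary and Mary smiles."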
Example #3
def main() -> None:
    in_dir = Path.home() / 'Research/hyperpartisan_news'
    out_dir = Path('../../data/interim/news/train')
    out_dir.mkdir(parents=True, exist_ok=True)
    corpus = in_dir / 'articles-training-bypublisher-20181122.xml'
    metadata = in_dir / 'ground-truth-training-bypublisher-20181122.xml'
    # dev_corpus = in_dir / 'articles-validation-bypublisher-20181122.xml'
    # dev_metadata = in_dir / 'ground-truth-validation-bypublisher-20181122.xml'
    data = parse_xml(corpus, metadata)

    processor = stanza.Pipeline(lang='en',
                                processors='tokenize',
                                tokenize_batch_size=4096)
    # processor = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'})

    for part_index, some_docs in tqdm(enumerate(partition(data, 100)),
                                      total=100,
                                      desc='Total'):
        for doc in tqdm(some_docs, desc='Chunk'):
            processed = processor(doc.text)
            doc.sentences = [
                Sentence([token.text for token in stanza_sent.tokens])
                for stanza_sent in processed.sentences
            ]
        with open(out_dir / f'tokenized_{part_index}.pickle', 'wb') as file:
            pickle.dump(some_docs, file, protocol=-1)
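
The partition helper is not defined in this example; a minimal sketch, assuming partition(data, n) yields n roughly equal contiguous chunks (which matches total=100 passed to tqdm above):

from typing import Iterator, List, Sequence, TypeVar

T = TypeVar('T')

def partition(items: Sequence[T], num_chunks: int) -> Iterator[List[T]]:
    # Yield num_chunks contiguous slices of roughly equal size.
    base, remainder = divmod(len(items), num_chunks)
    start = 0
    for i in range(num_chunks):
        end = start + base + (1 if i < remainder else 0)
        yield list(items[start:end])
        start = end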
Example #4
 def _preprocess_sents(self, raw_sents):
     processed_sents = []
     for s in raw_sents:
         processed_sent = Sentence(text=s,
                                   words=word_tokenize(s),
                                   position=None)
         processed_sents.append(processed_sent)
     return processed_sents
Example #5
 def sentence_break(self):
     if len(self.texts) == 0:
         return
     if self.config.iobes:
         self.tags = iob_to_iobes(self.tags)
     tokens = [Token(t, g) for t, g in zip(self.texts, self.tags)]
     self.document.add_child(Sentence(tokens=tokens))
     self.texts = []
     self.tags = []
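
iob_to_iobes is not defined in this snippet; a minimal sketch of the standard IOB-to-IOBES conversion, assuming well-formed IOB input:

def iob_to_iobes(tags):
    # Close each span with S-/E- when the next tag does not continue it.
    iobes = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            iobes.append(tag)
            continue
        prefix, label = tag.split('-', 1)
        next_tag = tags[i + 1] if i + 1 < len(tags) else 'O'
        continued = next_tag == 'I-' + label
        if prefix == 'B':
            iobes.append(tag if continued else 'S-' + label)
        else:  # prefix == 'I'
            iobes.append(tag if continued else 'E-' + label)
    return iobes

# ['B-PER', 'I-PER', 'O', 'B-LOC'] -> ['B-PER', 'E-PER', 'O', 'S-LOC']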
Example #6
def make_document(token_texts, label):
    """Return Document object initialized with given token texts."""
    tokens = [Token(t) for t in token_texts]
    # We don't have sentence splitting, but the data structure expects
    # Documents to contain Sentences which in turn contain Tokens.
    # Create a dummy sentence containing all document tokens to work
    # around this constraint.
    sentences = [Sentence(tokens=tokens)]
    return Document(target_str=label, sentences=sentences)
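
A quick usage sketch; the token texts and the label string are made up:

doc = make_document(["The", "cat", "sat", "."], "not-hyperpartisan")
# One dummy Sentence wraps all four Token objects inside the Document.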
Example #7
def draw_weighted_alignment_from_file(alignment_path, french_path, english_path, output_file, sure=False, sentence_id=1):
    """
    Draws an alignment that is weighted according to probs of alignment.
    We use the last column (4th) for the probability of alignment.
    """
    french = _read_sentences_from_file(french_path, sentence_id)
    english = _read_sentences_from_file(english_path, sentence_id)
    # alignments, _ = _read_alignment_from_file(naacl_path, sentence_id, sure)
    alignments, prediction_weights = _read_alignment_from_file(alignment_path, sentence_id, sure, weighted=True)

    draw_alignment(alignments, prediction_weights, Sentence(sentence_id, english, french), output_file)
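
_read_sentences_from_file and _read_alignment_from_file are helpers from the same module and are not shown; a hypothetical sketch of the former, assuming one sentence per line and 1-based positions (suggested by the sentence_id=1 default):

def _read_sentences_from_file(path, sentence_id):
    # Hypothetical: return the tokens of the sentence at the given 1-based position.
    with open(path) as lines:
        for position, line in enumerate(lines, start=1):
            if position == sentence_id:
                return line.split()
    raise ValueError(f'no sentence {sentence_id} in {path}')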
Example #8
    def _preprocess(self, articles):
        sent_splitter = SentenceSplitter()
        processed_articles = []
        for a in articles:
            body_sents = sent_splitter.split_sents(a['text'])
            processed_title = Sentence(text=a['title'],
                                       words=word_tokenize(a['title']),
                                       position=-1,
                                       is_title=True)
            processed_sents = []
            for position, s in enumerate(body_sents):
                processed_sent = Sentence(text=s,
                                          words=word_tokenize(s),
                                          position=position)
                processed_sents.append(processed_sent)

            processed_article = Article(processed_title, processed_sents)
            processed_articles.append(processed_article)

        return processed_articles
Example #9
def draw_alignment_from_file(naacl_path, french_path, english_path, file_name: str, sure=False, sentence_id=1):
    """
    input:
        naacl_path, file with gold alignments
        french_path, french sentences
        english_path, english sentences
        file_name, output figure path
        sure, print sure alignments
        sentence_id, position of the sentence to print from the corpus

    """
    french = _read_sentences_from_file(french_path, sentence_id)
    english = _read_sentences_from_file(english_path, sentence_id)
    alignments, _ = _read_alignment_from_file(naacl_path, sentence_id, sure, weighted=False)

    draw_alignment(alignments, [], Sentence(sentence_id, english, french), file_name)
Example #10
    def loadSrc(self):
        corpus = self.corpus
        src = self.src

        tokens = []
        token_id = 0
        with open(src, 'r') as fin:
            for line in fin:
                if line == '\n':
                    # A blank line ends the current sentence.
                    tmp_tokens = list(tokens)
                    n = len(tmp_tokens)
                    for i, token in enumerate(tmp_tokens):
                        h_id = token.h_id
                        h_rel = token.rel

                        if i < n - 1:
                            tmp_tokens[i].add_d_id_rel(i + 1, '@+1@')

                        if i > 0:
                            tmp_tokens[i].add_u_id_rel(i - 1, '@-1@')

                        if h_id != -1:
                            tmp_tokens[h_id].add_d_id_rel(i, h_rel)

                    sent = Sentence(tmp_tokens)
                    corpus.append(sent)
                    tokens = []
                    token_id = 0
                else:
                    items = line.strip().split()
                    t_str = items[0]
                    h_id = int(items[1])
                    rel = items[2]
                    label = items[3]
                    token = Token(token_id, t_str, h_id, rel, label)
                    tokens.append(token)
                    token_id += 1
        return corpus
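
The file read by loadSrc is not shown; judging from the parser, each non-blank line carries four whitespace-separated columns (token, head index within the sentence, relation, label), -1 appears to mark a token without a head, and a blank line closes a sentence. A hypothetical fragment:

sample_src = """The 1 det O
cat 2 nsubj O
sat -1 root O

"""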
Example #11
    def test_sentences(self):
        print('test sentences')
        texts = ["a x d z w w a", "a b c d w x y z", "w w x x a a"]
        sentences = [Sentence(text, self.vocab) for text in texts]
        tensors = [sentence.to_tensor() for sentence in sentences]

        print('vocab')
        print(self.vocab.w2id)

        print('encode')
        tokens = [self.vocab.encode(text) for text in texts]
        print(tokens)

        print('decode')
        decoded = [self.vocab.decode(ids) for ids in tokens]
        print(decoded)

        print('sentences')
        print(sentences)
        print('tensors')
        print(tensors)
        return sentences
Example #12
def train(**kwargs):
    print(kwargs)
    start = time.time()
    # Update the configuration from command-line arguments
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # Load word vectors
    print("Loading word vectors...Please wait.")
    vector = KeyedVectors.load_word2vec_format(
        os.path.join(os.path.dirname(os.path.realpath(opt.train_data_root)), 'vector.txt')
    )
    print("Successfully loaded word vectors.")

    # Step 1: model
    model = getattr(models, opt.model)(input_size=vector.vector_size + 2, output_size=opt.class_num)
    if opt.load_model_path:
        model.load(opt.load_model_path)  # preload saved weights
    if opt.use_gpu and t.cuda.is_available():
        model = model.cuda()
    print(f"Structure of {model.model_name}:\n{model}\n")

    # Step 2: data
    train_data = Sentence(root=opt.train_data_root,
                          relations=opt.relations,
                          max_length=opt.max_length,
                          vector=vector,
                          train=True)   # training set
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True)

    val_data = Sentence(opt.train_data_root, opt.relations, opt.max_length, vector, train=False)  # validation set
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=True)

    # Step 3: loss function and optimizer
    loss_fn = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(params=model.parameters(),
                             lr=lr,
                             weight_decay=opt.weight_decay)

    # Step 4: metrics: the smoothed loss and a confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(opt.class_num)
    previous_loss = 1e100
    
    # Training
    for epoch in range(opt.max_epoch):
        
        loss_meter.reset()
        confusion_matrix.reset()

        for ii, (data, label) in enumerate(train_dataloader):

            # Train the model parameters
            input = data
            target = label
            if opt.use_gpu and t.cuda.is_available():
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            prediction = model(input)
            loss = loss_fn(prediction, target)
            loss.backward()
            optimizer.step()
            
            # Update metrics and visualization
            loss_meter.add(loss.item())
            confusion_matrix.add(prediction.data, target.data)

            # if ii % opt.print_freq == opt.print_freq - 1:
            #     vis.plot('train loss', loss_meter.value()[0])

                # Enter debug mode if needed
                # if os.path.exists(opt.debug_file):
                #     import ipdb
                #     ipdb.set_trace()
        cm_value = confusion_matrix.value()
        correct = 0
        for i in range(cm_value.shape[0]):
            correct += cm_value[i][i]
        accuracy = 100. * correct / (cm_value.sum())

        vis.plot('train loss', loss_meter.value()[0])
        vis.plot('train accuracy', accuracy)

        if epoch % opt.save_epoch == opt.save_epoch - 1:
            model.save()

        # Compute metrics on the validation set and visualize them
        val_lm, val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val loss', val_lm.value()[0])
        vis.plot('val accuracy', val_accuracy)

        
        print("epoch:{epoch}, lr:{lr}, loss:{loss}\ntrain_cm:\n{train_cm}\nval_cm:\n{val_cm}"
                .format(epoch=epoch,
                        loss=loss_meter.value()[0],
                        val_cm=str(val_cm.value()),
                        train_cm=str(confusion_matrix.value()),
                        lr=lr)
        )
        
        # If the loss stops decreasing, decay the learning rate
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
                
        previous_loss = loss_meter.value()[0]

    cost = int(time.time() - start)
    print(f"Cost {cost // 60}min{cost % 60}s.")
Example #13
def main(sessions: Iterable[int], in_dir: Path, out_dir: Path,
         subsampling_implementation: Optional[str],
         subsampling_threshold: float, min_word_freq: int, min_sent_len: int,
         fixed_sent_len: int, eval_min_freq: int,
         eval_R_thresholds: Iterable[float], eval_num_random_samples: int,
         conserve_RAM: bool) -> None:
    out_dir.mkdir(parents=True, exist_ok=True)
    preview = open(out_dir / 'preview.txt', 'w')
    print(f'Reading sessions {sessions}. Writing to {out_dir}')
    print(f'Reading sessions {sessions}. Writing to {out_dir}', file=preview)
    print(f'Min word frequency = {min_word_freq}', file=preview)
    print(f'Min sentence length = {min_sent_len}', file=preview)
    print(f'Faux sentence fixed length = {fixed_sent_len}', file=preview)
    print(f'SGNS subsample implementation = {subsampling_implementation}',
          file=preview)
    print(f'SGNS subsample threshold = {subsampling_threshold}', file=preview)

    corpus: List[LabeledDoc] = []
    norm_freq: Counter[str] = Counter()
    for session in tqdm(
            sessions,
            desc='Loading multi-word expression underscored pickles...'):
        for party in ('D', 'R'):
            in_path = in_dir / f'underscored_{party}{session}.txt'
            with open(in_path) as underscored_corpus:
                for line in underscored_corpus:
                    underscored_tokens = line.split()
                    norm_freq.update(underscored_tokens)
                    corpus.append(
                        LabeledDoc(uid=None,
                                   title=None,
                                   url=None,
                                   party=party,
                                   referent=None,
                                   text=underscored_tokens,
                                   date=None,
                                   sentences=[]))
    cumulative_freq = sum(freq for freq in norm_freq.values())
    print(f'Normalized vocabulary size = {len(norm_freq):,}', file=preview)
    print(f'Number of words = {cumulative_freq:,}', file=preview)

    # Filter counter with MIN_FREQ and count UNK
    UNK_filtered_freq: Counter[str] = Counter()
    for key, val in norm_freq.items():
        if val >= min_word_freq:
            UNK_filtered_freq[key] = val
        else:
            UNK_filtered_freq['[UNK]'] += val
    print(f'Filtered vocabulary size = {len(UNK_filtered_freq):,}',
          file=preview)
    # No counts should be lost when folding rare words into [UNK]
    assert sum(UNK_filtered_freq.values()) == cumulative_freq

    # Subsampling & filter by min/max sentence length
    keep_prob = subsampling(UNK_filtered_freq, subsampling_implementation,
                            subsampling_threshold)
    ground: Dict[str, GroundedWord] = {}
    final_freq: Counter[str] = Counter()
    for doc in tqdm(corpus, desc='Subsampling frequent words'):
        subsampled_words = []
        for token in doc.text:
            if token in discard:
                continue
            if token not in UNK_filtered_freq:
                token = '[UNK]'
            if random.random() < keep_prob[token]:
                subsampled_words.append(token)

        for faux_sent in faux_sent_tokenize(subsampled_words, fixed_sent_len,
                                            min_sent_len):
            final_freq.update(faux_sent)
            doc.sentences.append(Sentence(subsampled_tokens=faux_sent))
            for word in faux_sent:
                if word not in ground:
                    ground[word] = GroundedWord(text=word,
                                                deno=None,
                                                cono=Counter({doc.party: 1}))
                else:
                    ground[word].cono[doc.party] += 1

        if conserve_RAM:
            doc.text = None
    # End looping documents
    print(f'Final vocabulary size = {len(final_freq):,}', file=preview)
    print(
        f'Subsampled number of words = '
        f'{sum(freq for freq in final_freq.values()):,}',
        file=preview)

    # Filter out empty documents
    corpus = [doc for doc in corpus if len(doc.sentences) > 0]

    # Numericalize corpus by word_ids
    word_to_id, id_to_word = build_vocabulary(final_freq)
    for doc in tqdm(corpus, desc='Converting to word ids'):
        for sent in doc.sentences:
            sent.numerical_tokens = [
                word_to_id[token] for token in sent.subsampled_tokens
            ]
            if conserve_RAM:
                sent.subsampled_tokens = None

    # Prepare grounding for intrinsic evaluation
    random_eval_words = set()
    for gw in ground.values():
        gw.majority_cono = gw.cono.most_common(1)[0][0]
        gw.freq = sum(gw.cono.values())
        gw.R_ratio = gw.cono['R'] / gw.freq
        if gw.freq >= eval_min_freq:
            random_eval_words.add(gw.text)
    # random.sample requires a sequence (sets are rejected on Python 3.11+)
    random_eval_words = random.sample(list(random_eval_words),
                                      eval_num_random_samples)
    with open(out_dir / 'eval_words_random.txt', 'w') as file:
        file.write('\n'.join(random_eval_words))

    for R_threshold in eval_R_thresholds:
        D_threshold = 1 - R_threshold
        partisan_eval_words = []
        for gw in ground.values():
            if gw.freq >= eval_min_freq:
                if gw.R_ratio >= R_threshold or gw.R_ratio <= D_threshold:
                    partisan_eval_words.append(gw)
        print(
            f'{len(partisan_eval_words)} partisan eval words '
            f'with R_threshold = {R_threshold}',
            file=preview)

        out_path = out_dir / f'inspect_{R_threshold}_partisan.tsv'
        with open(out_path, 'w') as file:
            print('word\tfreq\tR_ratio', file=file)
            for gw in partisan_eval_words:
                print(gw.text, gw.freq, gw.R_ratio, sep='\t', file=file)

        if len(partisan_eval_words) > 2 * eval_num_random_samples:
            partisan_eval_words = random.sample(partisan_eval_words,
                                                2 * eval_num_random_samples)
        else:
            random.shuffle(partisan_eval_words)

        mid = len(partisan_eval_words) // 2
        with open(out_dir / f'{R_threshold}partisan_dev_words.txt',
                  'w') as file:
            for gw in partisan_eval_words[:mid]:
                print(gw.text, file=file)
        with open(out_dir / f'{R_threshold}partisan_test_words.txt',
                  'w') as file:
            for gw in partisan_eval_words[mid:]:
                print(gw.text, file=file)

    # Helper for negative sampling
    cumulative_freq = sum(freq**0.75 for freq in final_freq.values())
    negative_sampling_probs: Dict[int, float] = {
        word_to_id[word]: (freq**0.75) / cumulative_freq
        for word, freq in final_freq.items()
    }
    vocab_size = len(word_to_id)
    negative_sampling_probs: List[float] = [
        # negative_sampling_probs[word_id]  # strict
        negative_sampling_probs.get(word_id, 0)  # prob = 0 if missing vocab
        for word_id in range(vocab_size)
    ]

    random.shuffle(corpus)
    cucumbers = {
        'word_to_id': word_to_id,
        'id_to_word': id_to_word,
        'ground': ground,
        'negative_sampling_probs': negative_sampling_probs,
        'documents': corpus
    }
    print(f'Writing to {out_dir}')
    with open(out_dir / 'train.pickle', 'wb') as out_file:
        pickle.dump(cucumbers, out_file, protocol=-1)

    # Print out vocabulary & some random sentences for sanity check
    docs = random.sample(corpus, 100)
    preview.write('\n')
    for doc in docs:
        sent = doc.sentences[0]
        if not conserve_RAM:
            # print(sent.tokens, file=preview)
            # print(sent.normalized_tokens, file=preview)
            print(sent.subsampled_tokens, file=preview)
            print(sent.numerical_tokens, file=preview, end='\n\n')
        else:
            print(sent.numerical_tokens, file=preview)
            # print(vars(doc), end='\n\n', file=preview)
    preview.write('\n\nfinal_freq\tword\n')
    for key, val in final_freq.most_common():
        print(f'{val:,}\t{ground[key]}', file=preview)
    preview.close()
    print('All set!')
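
subsampling(), faux_sent_tokenize(), build_vocabulary(), and the data classes come from the surrounding project. To illustrate the subsampling step only, here is a minimal sketch of the two usual SGNS keep-probability variants the helper plausibly chooses between; the 'code'/'paper' values and the exact signature are assumptions:

import math
from collections import Counter
from typing import Dict

def subsampling(freq: Counter, implementation: str, threshold: float) -> Dict[str, float]:
    # Hypothetical helper: per-word keep probability for frequent-word subsampling.
    total = sum(freq.values())
    keep_prob = {}
    for word, count in freq.items():
        f = count / total  # relative frequency
        if implementation == 'code':
            # word2vec source-code variant
            prob = (math.sqrt(f / threshold) + 1) * (threshold / f)
        else:
            # Mikolov et al. (2013) paper variant: keep with probability sqrt(t / f)
            prob = math.sqrt(threshold / f)
        keep_prob[word] = min(1.0, prob)
    return keep_prob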
Example #14
def inputFromSentence(sentence: Sentence, max_length: int) -> Sentence:
    if len(sentence) > max_length:
        sentence = sentence[:max_length]
    if len(sentence) < max_length:
        sentence.extend([0] * (max_length - len(sentence)))
    return sentence
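
Despite the Sentence annotations, the function treats its argument as a list of token ids; a quick sketch of the padding and truncation behaviour, using plain lists in place of Sentence:

inputFromSentence([3, 17, 4], max_length=5)           # -> [3, 17, 4, 0, 0]
inputFromSentence([3, 17, 4, 9, 2, 8], max_length=4)  # -> [3, 17, 4, 9]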