Example #1
def create_index_ex(file_name):
    dic = Dictionary()
    csv_file = open(file_name)
    csv_reader = csv.reader(csv_file)
    rows = [row for row in csv_reader]  # every row of the csv file is now in rows
    csv_file.close()  # the reader is exhausted, so the file can be closed
    # Merge the postings after finishing each row.
    for row_no in range(len(rows)):
        index = []  # the inverted index for this row is a plain list
        row = rows[row_no]  # row is one line of the csv file
        for string in row:  # string is one field of the csv row
            words = language.text_process(string)
            words = language.get_dictionary_list(words)  # tokenization happens here
            # Add every term of this field to the dictionary and build the index.
            for word in words:
                dic.add(word)
                word_ptr = dic.get_position(word)
                if word_ptr not in index:  # the inverted index does not contain this term yet
                    index.append(word_ptr)
        index = sorted(index)
        # Convert the term pointers in index to strings so they can be joined below.
        for i in range(len(index)):
            index[i] = str(index[i])
        # After each row, create a file that stores the term pointers occurring in that row.
        f_name = str(row_no) + '.txt'
        f_name = 'index/' + f_name
        f = open(f_name, 'w')
        doc_content = ','.join(index)
        # The pointer list is small, so reading the whole file back in later is cheap.
        f.write(doc_content)
        f.close()
        # print(index)
    dic.write2file('dictionary.txt')
    return index
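A minimal usage sketch for the example above, assuming the project-local Dictionary class and language module are importable and that an index/ output directory is needed; the file name docs.csv is only illustrative:

import os

# Hypothetical input file; any CSV whose fields contain free text will do.
csv_path = 'docs.csv'
os.makedirs('index', exist_ok=True)  # the function writes one file per row into index/
last_row_pointers = create_index_ex(csv_path)
print('term pointers of the last row:', last_row_pointers)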
Example #2
def creat_word_rel_dict(r_file, *q_files):
    word_dict = Dictionary()
    word_dict.add_unk_token()
    word_dict.add_pad_token()
    word_dict.add_start_token()

    for q_file in q_files:
        qa_data = pickle.load(open(q_file, 'rb'))
        for data in qa_data:
            q = data.question
            tokens = q.split(' ')
            for token in tokens:
                word_dict.add(token)
    print(len(word_dict))

    rels = pickle.load(open(r_file, 'rb'))
    for rel in rels:
        rel_word = []
        w = rel[3:].split('.')
        for i in w:
            rel_word.extend(i.split('_'))
        for word in rel_word:
            word_dict.add(word)
    print(len(word_dict))
    return word_dict
Example #3
def load_type_dictionary(filename, word_dict=None):
    if word_dict is None:
        word_dict = Dictionary()
        word_dict.add_unk_token()
        word_dict.add_pad_token()
    data = pickle.load(open(filename, 'rb'))
    for ty in data:
        word_dict.add(ty)
    return word_dict
Example #4
def get_postag_data(config,
                    train_path,
                    dev_path,
                    vocab_path=None,
                    label_path=None):
    use_se_marker = config.use_se_marker
    raw_train_sents = get_sentences(train_path, use_se_marker)
    raw_dev_sents = get_sentences(dev_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])

    # Prepare word dictionary.
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path != None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
            f_vocab.close()
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path != None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
            f_labels.close()
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    train_sents = [(string_sequence_to_ids(sent[0], word_dict, True,
                                           word_to_embeddings),
                    string_sequence_to_ids(sent[1], label_dict))
                   for sent in raw_train_sents]
    dev_sents = [(string_sequence_to_ids(sent[0], word_dict, True,
                                         word_to_embeddings),
                  string_sequence_to_ids(sent[1], label_dict))
                 for sent in raw_dev_sents]

    print("Extracted {} words and {} tags".format(word_dict.size(),
                                                  label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[0]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[0]) for s in dev_sents])))
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict, [word_embedding],
            [word_embedding_shape])
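A hedged sketch of how the returned tuple might be consumed, assuming a project-specific config object; the paths train.txt and dev.txt are placeholders:

# Hypothetical paths and config; the real values come from the project's setup.
data = get_postag_data(config, 'train.txt', 'dev.txt',
                       vocab_path=None, label_path=None)
train_sents, dev_sents, word_dict, label_dict, embeddings, embedding_shapes = data
print('vocabulary size:', word_dict.size())
print('word embedding shape:', embedding_shapes[0])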
Example #5
def load_word_dictionary(filename, word_dict=None):
    if word_dict is None:
        word_dict = Dictionary()
        word_dict.add_unk_token()
        word_dict.add_pad_token()
    with open(filename) as f:
        for line in f:
            if not line: break
            line = line.strip()
            if not line: continue
            word_dict.add(line)
    return word_dict
Example #6
def load_rel_separated_dictionary(filename):
    rel1_dict = Dictionary()
    rel1_dict.add_unk_token()
    rel1_dict.add_pad_token()
    rel2_dict = Dictionary()
    rel2_dict.add_unk_token()
    rel2_dict.add_pad_token()
    with open(filename) as f:
        for line in f:
            if not line: break
            line = line.strip()
            if not line: continue
            line = line.split('.')
            rel1 = '.'.join(line[:-1])
            rel2 = line[-1]
            rel1_dict.add(rel1)
            rel2_dict.add(rel2)
    return rel1_dict, rel2_dict
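A small usage sketch, assuming a plain-text relation file with one dotted relation name per line (Freebase-style, e.g. people.person.place_of_birth); the file name relations.txt is illustrative:

# Hypothetical relation list; each line holds one dotted relation name.
rel1_dict, rel2_dict = load_rel_separated_dictionary('relations.txt')
# For 'people.person.place_of_birth', rel1 is 'people.person' and rel2 is 'place_of_birth'.
print(len(rel1_dict), len(rel2_dict))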
Example #7
def get_srl_data(config,
                 train_data_path,
                 dev_data_path,
                 vocab_path=None,
                 label_path=None):
    '''
    '''
    use_se_marker = config.use_se_marker
    raw_train_sents = get_srl_sentences(train_data_path, use_se_marker)
    raw_dev_sents = get_srl_sentences(dev_data_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])  # get pre-trained embeddings

    # Prepare word dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path != None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
            f_vocab.close()
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path != None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
            f_labels.close()
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    # Get tokens and labels: [sentence_id, word, predicate, label]
    train_sentences_ids = [sent[0] for sent in raw_train_sents]
    train_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_train_sents
    ]
    train_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_train_sents
    ]

    if label_dict.accept_new:
        label_dict.set_unknown_token(
            UNKNOWN_LABEL)  # train corpus contains the label 'O' ?
        label_dict.accept_new = False

    dev_sentences_ids = [sent[0] for sent in raw_dev_sents]
    dev_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_dev_sents
    ]
    dev_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_dev_sents
    ]
    print('Total tokens in Dev dataset {}'.format(
        sum([len(sent[1]) for sent in raw_dev_sents])))
    # Get features
    print('Extracting features')
    train_features, feature_shapes = features.get_srl_features(
        raw_train_sents, config)
    dev_features, feature_shapes2 = features.get_srl_features(
        raw_dev_sents, config)
    for f1, f2 in zip(feature_shapes, feature_shapes2):
        assert f1 == f2

    # For additional features. Unused now.
    feature_dicts = []
    for feature in config.features:
        feature_dicts.append(None)

    train_sents = []
    dev_sents = []
    for i in range(len(train_tokens)):
        train_sents.append((train_sentences_ids[i], ) + (train_tokens[i], ) +
                           tuple(train_features[i]) + (train_labels[i], ))
    for i in range(len(dev_tokens)):
        dev_sents.append((dev_sentences_ids[i], ) + (dev_tokens[i], ) +
                         tuple(dev_features[i]) + (dev_labels[i], ))

    print("Extraced {} words and {} tags".format(word_dict.size(),
                                                 label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[1]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[1]) for s in dev_sents])))
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding, None,
             None], [word_embedding_shape] + feature_shapes, [
                 word_dict,
             ] + feature_dicts)
Example #8
def get_srl_data(config,
                 train_data_path,
                 dep_path,
                 dev_data_path,
                 vocab_path=None,
                 char_path=None,
                 label_path=None):
    # Load sentences (documents) from data paths respectively.
    raw_train_sents = get_srl_sentences(train_data_path)
    raw_dev_sents = get_srl_sentences(dev_data_path)
    # Load dev data
    eval_data = load_eval_data(dev_data_path)
    # Load pretrained embeddings
    word_embeddings = get_pretrained_embeddings(
        config.word_embedding)  # get pre-trained embeddings
    head_embeddings = get_pretrained_embeddings(config.head_embedding)

    # Prepare word embedding dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    # Prepare head embedding dictionary.
    head_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    # Prepare char dictionary.
    char_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    with open(char_path, 'r') as f_char:
        for line in f_char:
            char_dict.add(line.strip())
        f_char.close()
    char_dict.accept_new = False
    print('Loaded {} chars. Dictionary frozen.'.format(char_dict.size()))
    # Prepare SRL label dictionary.
    label_dict = Dictionary()
    label_dict.set_unknown_token(
        NULL_LABEL)  # train corpus contains the label 'O' ?
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
            f_labels.close()
        label_dict.set_unknown_token(NULL_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))
    # Prepare dependency label dictionary.
    dep_label_dict = Dictionary()

    # Training data: Get tokens and labels: [sentence_id, word, predicate, label]
    train_samples = tokenize_data(raw_train_sents, word_dict, head_dict,
                                  char_dict, label_dict, False,
                                  word_embeddings, head_embeddings)
    # Data for dep Trees
    with Timer("Loading Dependency Trees"):
        dep_trees = SyntacticCONLL()
        dep_trees.read_from_file(dep_path, prune_ratio=config.dep_prune_ratio)
        dep_trees.tokenize_dep_trees(word_dict, char_dict, dep_label_dict,
                                     word_embeddings)

    # set dictionary freezed
    char_dict.accept_new, label_dict.accept_new, dep_label_dict.accept_new = False, False, False
    # Development data:
    dev_samples = tokenize_data(raw_dev_sents, word_dict, head_dict, char_dict,
                                label_dict, False, word_embeddings,
                                head_embeddings)

    # set word and head dict freezed.
    word_dict.accept_new, head_dict.accept_new = False, False

    print("Extract {} words and {} tags".format(word_dict.size(),
                                                label_dict.size()))
    print("Max training sentence length: {}".format(
        max([s[1] for s in train_samples])))
    print("Max development sentence length: {}".format(
        max([s[1] for s in dev_samples])))

    word_embedding = np.asarray(
        [word_embeddings[w] for w in word_dict.idx2str])
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    head_embedding = np.asarray(
        [head_embeddings[w] for w in head_dict.idx2str])
    head_embedding_shape = [len(head_embedding), len(head_embedding[0])]
    print("word embedding shape {}, head embedding shape {}".format(
        word_embedding_shape, head_embedding_shape))
    return (train_samples, dev_samples, dep_trees.sample_dep_data, eval_data,
            word_dict, head_dict, char_dict, label_dict, dep_label_dict,
            [word_embedding,
             head_embedding], [word_embedding_shape, head_embedding_shape])
Example #9
import pickle

import torch

from dictionary import Dictionary

rel_dict_path = "../../data/vocab/vocab.rel.pt"

# ent_dict = torch.load(ent_dict_path)
ent_dict = Dictionary()
rel_dict = torch.load(rel_dict_path)

dev = pickle.load(open("../relation_prediction/data/new_dev.pkl", "rb"))
test = pickle.load(open("../relation_prediction/data/new_test.pkl", "rb"))
train = pickle.load(open("../relation_prediction/data/new_train.pkl", "rb"))
with open("../../data/embed/valid.txt", "w") as f:
    for index, row in dev.iterrows():
        f.write(row["subject"] + "\t" + row["relation"] + "\t" +
                row["object"] + "\n")
        ent_dict.add(row["subject"])
        ent_dict.add(row["object"])
        rel_dict.add(row["relation"])

with open("../../data/embed/test.txt", "w") as f:
    for index, row in test.iterrows():
        f.write(row["subject"] + "\t" + row["relation"] + "\t" +
                row["object"] + "\n")
        ent_dict.add(row["subject"])
        ent_dict.add(row["object"])
        rel_dict.add(row["relation"])

with open("../../data/embed/train.txt", "w") as f:
    for index, row in train.iterrows():
        f.write(row["subject"] + "\t" + row["relation"] + "\t" +
                row["object"] + "\n")
Example #10
            'Original Subject Name': row['subject_name'],
            'Normalized Edit Distance': normalized_edit_distance,
            'Question Tokens': row['question_tokens'],
        })

    exact_match = [
        d for d in print_data if d['Normalized Edit Distance'] == 1.0
    ]

    # get the word dictionary
    word_vocab = Dictionary()
    word_vocab.add_unk_token()
    word_vocab.add_pad_token()
    word_vocab.add_start_token()
    word_vocab.add_end_token()
    word_vocab.add("<e>")

    add_word(df_dev)
    add_word(df_test)
    add_word(df_train)

    torch.save(word_vocab, "../../data/vocab/word_vocab.pt")

    # get the training data and test data
    get_formatted_examples(128, '../../data/subject_recognition/dev.pt',
                           df_dev)
    get_formatted_examples(128, '../../data/subject_recognition/test.pt',
                           df_test)
    get_formatted_examples(128, '../../data/subject_recognition/train.pt',
                           df_train)
Example #11
class Graph:
    def __init__(self):
        self.vertices = []
        self.adjList = Dictionary()

    def addVertex(self, v):
        self.vertices.append(v)
        self.adjList.add(v, [])

    def addEdge(self, v, w):
        self.adjList.get(v).append(w)
        self.adjList.get(w).append(v)

    def toString(self):
        stx = ''
        for i in range(0, len(self.vertices)):
            stx += self.vertices[i] + ' => '
            neighbors = self.adjList.get(self.vertices[i])
            for j in range(0, len(neighbors)):
                stx += neighbors[j] + ' '
            stx += '\n'
        print(stx)
        return stx

    # Breadth-First Search, BFS
    def bfs(self, v, callback):
        # Initialize colors, distances, and predecessors for every vertex.
        d = {}
        pred = {}
        color = {}

        for i in self.vertices:
            color[i] = 'white'
            d[i] = 0
            pred[i] = None

        queue = Queue()
        queue.enqueue(v)

        while not queue.isEmpty():
            u = queue.dequeue()
            neighbors = self.adjList.get(u)
            color[u] = 'grey'

            for n in neighbors:
                if color[n] == 'white':
                    color[n] = 'grey'
                    # Count distance and set pred
                    d[n] = d[u] + 1
                    pred[n] = u
                    queue.enqueue(n)
            color[u] = 'black'
        print('distance is =>', d)
        print('predecessors is =>', pred)
        return pred

    # Depth-First Search, DFS
    def dfs(self, df_u):
        df_color = {}
        d = {}  # discovered time of vertices
        f = {}  # completed time of vertices
        p = {}  # predecessor of vertices
        time = 0
        for i in self.vertices:
            df_color[i] = 'white'
        for i in self.vertices:
            f[i] = 0
            d[i] = 0
            p[i] = None
        self.dfsVisit(df_u, df_color, d, f, p, time)
        print('discovery is =>', d)
        print('finished is =>', f)
        print('predecessor is =>', p)

    def dfsVisit(self, u, color, d, f, p, time):
        print('discovered => ', u)
        color[u] = 'grey'
        time += 1
        d[u] = time
        neighbors = self.adjList.get(u)
        for i in range(0, len(neighbors)):
            w = neighbors[i]
            if color[w] == 'white':
                p[w] = u
                time = self.dfsVisit(w, color, d, f, p, time)  # keep the timestamp in sync across recursive calls
        color[u] = 'black'
        time += 1
        f[u] = time
        print('explored => ', u)
        return time
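A short usage sketch for the class above, assuming the project-local Dictionary and Queue implementations it relies on are available; the vertex names are illustrative:

# Build a small undirected graph and run both traversals.
graph = Graph()
for vertex in ['A', 'B', 'C', 'D']:
    graph.addVertex(vertex)
graph.addEdge('A', 'B')
graph.addEdge('A', 'C')
graph.addEdge('C', 'D')

graph.toString()      # prints the adjacency list
graph.bfs('A', None)  # distances and predecessors from 'A'
graph.dfs('A')        # discovery and finish times from 'A'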
Example #12
class TestDictionary(unittest.TestCase):
    def setUp(self):
        self.d = Dictionary()

    def test_010_is_empty_when_created(self):
        self.assertEqual(self.d.entries(), {})

    def test_020_can_add_whole_entries_with_keyword_and_definition(self):
        self.d.add('fish', 'aquatic animal')
        self.assertEqual(self.d.entries(), {'fish': 'aquatic animal'})
        self.assertEqual(self.d.keywords(), ['fish'])

    def test_030_add_keywords_without_definition(self):
        self.d.add('fish')
        self.assertEqual(self.d.entries(), {'fish': None})
        self.assertEqual(self.d.keywords(), ['fish'])

    def test_040_can_check_whether_a_given_keyword_exists(self):
        self.assertFalse(self.d.includes('fish'))

    def test_050_doesnt_cheat_when_checking_whether_keyword_exists(self):
        self.assertFalse(self.d.includes('fish'))
        self.d.add('fish')
        self.assertTrue(self.d.includes('fish'))
        self.assertFalse(self.d.includes('bird'))

    def test_060_doesnt_include_a_prefix_in_and_of_itself(self):
        self.d.add('fish')
        self.assertFalse(self.d.includes('fi'))

    def test_070_doesnt_find_a_word_in_an_empty_dictionary(self):
        self.assertFalse(self.d.includes('fi'))

    def test_080_finds_nothing_if_the_prefix_matches_nothing(self):
        self.d.add('fiend')
        self.d.add('great')
        self.assertEqual(len(self.d.find('nothing')), 0)

    def test_090_finds_an_entry(self):
        self.d.add('fish', 'aquatic animal')
        self.assertEqual(self.d.find('fish'), {'fish': 'aquatic animal'})

    def test_100_finds_multiple_matches_from_a_prefix_and_returns_the_entire_entry(
            self):
        self.d.add('fish', 'aquatic animal')
        self.d.add('fiend', 'wicked person')
        self.d.add('great', 'remarkable')
        self.assertEqual(self.d.find('fi'), {
            'fish': 'aquatic animal',
            'fiend': 'wicked person'
        })

    def test_110_lists_keywords_alphabetically(self):
        self.d.add('zebra', 'African land animal with stripes')
        self.d.add('fish', 'aquatic animal')
        self.d.add('apple', 'fruit')
        self.assertEqual(self.d.keywords(), ['apple', 'fish', 'zebra'])

    def test_120_can_produce_printable_output(self):
        self.d.add('zebra', 'African land animal with stripes')
        self.d.add('fish', 'aquatic animal')
        self.d.add('apple', 'fruit')
        should_str = """[apple] \"fruit\"
[fish] \"aquatic animal\"
[zebra] \"African land animal with stripes\""""
        self.assertEqual(self.d.printable(), should_str)
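To run these tests directly, the standard unittest entry point can be appended to the test module (assuming unittest and the Dictionary under test are already imported there):

if __name__ == '__main__':
    unittest.main()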