import csv

# 'language' (tokenization helpers) and 'Dictionary' are project-local modules
# assumed to be importable alongside this function.


def create_index_ex(file_name):
    dic = Dictionary()
    with open(file_name) as csv_file:
        rows = list(csv.reader(csv_file))  # every row of the csv table is now in rows
    # Build and write one inverted-index file per row.
    for row_no in range(len(rows)):
        index = []  # the inverted index is a list of term pointers
        row = rows[row_no]  # one row of the csv table
        for string in row:  # one cell of the csv table
            words = language.text_process(string)
            words = language.get_dictionary_list(words)  # tokenization done
            # Add every term of this cell to the dictionary and index it.
            for word in words:
                dic.add(word)
                word_ptr = dic.get_position(word)
                if word_ptr not in index:  # term not yet in the inverted index
                    index.append(word_ptr)
        index = sorted(index)
        # Convert the term pointers to strings for the join below.
        for i in range(len(index)):
            index[i] = str(index[i])
        # After reading each row, create a file storing the term pointers that occur in it.
        f_name = 'index/' + str(row_no) + '.txt'
        with open(f_name, 'w') as f:
            # The pointer list stays small, so reading the whole file back
            # at once later is reasonable.
            f.write(','.join(index))
        # print(index)
    dic.write2file('dictionary.txt')
    return index  # note: this is the index of the last row only
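# A minimal sketch (not part of the original code) of how the per-row index
# files written above could be read back. It assumes only the layout produced
# by create_index_ex: 'index/<row_no>.txt' holding comma-separated term pointers.
def read_index_ex(row_no):
    with open('index/' + str(row_no) + '.txt') as f:
        content = f.read().strip()
    # An empty row produces an empty file, so guard against splitting ''.
    return [int(ptr) for ptr in content.split(',')] if content else []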
def creat_word_rel_dict(r_file, *q_files):
    word_dict = Dictionary()
    word_dict.add_unk_token()
    word_dict.add_pad_token()
    word_dict.add_start_token()
    # Add every question token to the dictionary.
    for q_file in q_files:
        qa_data = pickle.load(open(q_file, 'rb'))
        for data in qa_data:
            q = data.question
            tokens = q.split(' ')
            for token in tokens:
                word_dict.add(token)
    print(len(word_dict))
    # Add the words of every relation name. rel[3:] strips the first three
    # characters (presumably a fixed namespace prefix) before splitting the
    # dotted name into its '_'-separated words.
    rels = pickle.load(open(r_file, 'rb'))
    for rel in rels:
        rel_word = []
        w = rel[3:].split('.')
        for i in w:
            rel_word.extend(i.split('_'))
        for word in rel_word:
            word_dict.add(word)
    print(len(word_dict))
    return word_dict
def load_type_dictionary(filename, word_dict=None):
    if word_dict is None:
        word_dict = Dictionary()
        word_dict.add_unk_token()
        word_dict.add_pad_token()
    data = pickle.load(open(filename, 'rb'))
    for ty in data:
        word_dict.add(ty)
    return word_dict
def get_postag_data(config, train_path, dev_path, vocab_path=None, label_path=None):
    use_se_marker = config.use_se_marker
    raw_train_sents = get_sentences(train_path, use_se_marker)
    raw_dev_sents = get_sentences(dev_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])

    # Prepare word dictionary.
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    train_sents = [(string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings),
                    string_sequence_to_ids(sent[1], label_dict))
                   for sent in raw_train_sents]
    dev_sents = [(string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings),
                  string_sequence_to_ids(sent[1], label_dict))
                 for sent in raw_dev_sents]

    print("Extracted {} words and {} tags".format(word_dict.size(), label_dict.size()))
    print("Max training sentence length: {}".format(max([len(s[0]) for s in train_sents])))
    print("Max development sentence length: {}".format(max([len(s[0]) for s in dev_sents])))

    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding], [word_embedding_shape])
def load_word_dictionary(filename, word_dict=None):
    if word_dict is None:
        word_dict = Dictionary()
        word_dict.add_unk_token()
        word_dict.add_pad_token()
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:  # skip blank lines
                continue
            word_dict.add(line)
    return word_dict
def load_rel_separated_dictionary(filename):
    rel1_dict = Dictionary()
    rel1_dict.add_unk_token()
    rel1_dict.add_pad_token()
    rel2_dict = Dictionary()
    rel2_dict.add_unk_token()
    rel2_dict.add_pad_token()
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:  # skip blank lines
                continue
            parts = line.split('.')
            rel1 = '.'.join(parts[:-1])  # everything up to the last dot
            rel2 = parts[-1]             # final component of the relation name
            rel1_dict.add(rel1)
            rel2_dict.add(rel2)
    return rel1_dict, rel2_dict
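# A minimal usage sketch (the file paths are hypothetical, not from the
# original code). Given a relation vocabulary with dotted names such as
# 'music.artist.genre', load_rel_separated_dictionary splits each entry at
# its last dot: 'music.artist' goes into rel1_dict, 'genre' into rel2_dict.
#
#   word_dict = load_word_dictionary('vocab/words.txt')
#   rel1_dict, rel2_dict = load_rel_separated_dictionary('vocab/relations.txt')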
def get_srl_data(config, train_data_path, dev_data_path, vocab_path=None, label_path=None):
    '''Load SRL train/dev sentences, build word/label dictionaries, and extract features.'''
    use_se_marker = config.use_se_marker
    raw_train_sents = get_srl_sentences(train_data_path, use_se_marker)
    raw_dev_sents = get_srl_sentences(dev_data_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])  # pre-trained embeddings

    # Prepare word dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    # Get tokens and labels: [sentence_id, word, predicate, label]
    train_sentences_ids = [sent[0] for sent in raw_train_sents]
    train_tokens = [string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
                    for sent in raw_train_sents]
    train_labels = [string_sequence_to_ids(sent[3], label_dict) for sent in raw_train_sents]
    if label_dict.accept_new:
        # Set the unknown label now (does the train corpus contain the label 'O'?).
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False

    dev_sentences_ids = [sent[0] for sent in raw_dev_sents]
    dev_tokens = [string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
                  for sent in raw_dev_sents]
    dev_labels = [string_sequence_to_ids(sent[3], label_dict) for sent in raw_dev_sents]
    print('Total tokens in dev dataset: {}'.format(
        sum([len(sent[1]) for sent in raw_dev_sents])))

    # Get features.
    print('Extracting features')
    train_features, feature_shapes = features.get_srl_features(raw_train_sents, config)
    dev_features, feature_shapes2 = features.get_srl_features(raw_dev_sents, config)
    for f1, f2 in zip(feature_shapes, feature_shapes2):
        assert f1 == f2

    # For additional features. Unused now.
    feature_dicts = [None for _ in config.features]

    train_sents = []
    dev_sents = []
    for i in range(len(train_tokens)):
        train_sents.append((train_sentences_ids[i],) + (train_tokens[i],) +
                           tuple(train_features[i]) + (train_labels[i],))
    for i in range(len(dev_tokens)):
        dev_sents.append((dev_sentences_ids[i],) + (dev_tokens[i],) +
                         tuple(dev_features[i]) + (dev_labels[i],))

    print("Extracted {} words and {} tags".format(word_dict.size(), label_dict.size()))
    print("Max training sentence length: {}".format(max([len(s[1]) for s in train_sents])))
    print("Max development sentence length: {}".format(max([len(s[1]) for s in dev_sents])))

    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding, None, None],
            [word_embedding_shape] + feature_shapes,
            [word_dict, ] + feature_dicts)
def get_srl_data(config, train_data_path, dep_path, dev_data_path,
                 vocab_path=None, char_path=None, label_path=None):
    # Load sentences (documents) from the data paths.
    raw_train_sents = get_srl_sentences(train_data_path)
    raw_dev_sents = get_srl_sentences(dev_data_path)
    # Load dev data for evaluation.
    eval_data = load_eval_data(dev_data_path)

    # Load pre-trained embeddings.
    word_embeddings = get_pretrained_embeddings(config.word_embedding)
    head_embeddings = get_pretrained_embeddings(config.head_embedding)

    # Prepare word embedding dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
    # Prepare head embedding dictionary.
    head_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
    # Prepare char dictionary.
    char_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
    with open(char_path, 'r') as f_char:
        for line in f_char:
            char_dict.add(line.strip())
    char_dict.accept_new = False
    print('Loaded {} chars. Dictionary frozen.'.format(char_dict.size()))

    # Prepare SRL label dictionary.
    label_dict = Dictionary()
    label_dict.set_unknown_token(NULL_LABEL)  # does the train corpus contain the label 'O'?
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(NULL_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    # Prepare dependency label dictionary.
    dep_label_dict = Dictionary()

    # Training data: get tokens and labels: [sentence_id, word, predicate, label]
    train_samples = tokenize_data(raw_train_sents, word_dict, head_dict, char_dict,
                                  label_dict, False, word_embeddings, head_embeddings)

    # Data for dependency trees.
    with Timer("Loading Dependency Trees"):
        dep_trees = SyntacticCONLL()
        dep_trees.read_from_file(dep_path, prune_ratio=config.dep_prune_ratio)
        dep_trees.tokenize_dep_trees(word_dict, char_dict, dep_label_dict, word_embeddings)

    # Freeze the dictionaries.
    char_dict.accept_new, label_dict.accept_new, dep_label_dict.accept_new = False, False, False

    # Development data:
    dev_samples = tokenize_data(raw_dev_sents, word_dict, head_dict, char_dict,
                                label_dict, False, word_embeddings, head_embeddings)

    # Freeze the word and head dictionaries.
    word_dict.accept_new, head_dict.accept_new = False, False

    print("Extracted {} words and {} tags".format(word_dict.size(), label_dict.size()))
    print("Max training sentence length: {}".format(max([s[1] for s in train_samples])))
    print("Max development sentence length: {}".format(max([s[1] for s in dev_samples])))

    word_embedding = np.asarray([word_embeddings[w] for w in word_dict.idx2str])
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    head_embedding = np.asarray([head_embeddings[w] for w in head_dict.idx2str])
    head_embedding_shape = [len(head_embedding), len(head_embedding[0])]
    print("word embedding shape {}, head embedding shape {}".format(
        word_embedding_shape, head_embedding_shape))
    return (train_samples, dev_samples, dep_trees.sample_dep_data, eval_data,
            word_dict, head_dict, char_dict, label_dict, dep_label_dict,
            [word_embedding, head_embedding],
            [word_embedding_shape, head_embedding_shape])
import pickle

import torch

from dictionary import Dictionary

rel_dict_path = "../../data/vocab/vocab.rel.pt"
# ent_dict = torch.load(ent_dict_path)
ent_dict = Dictionary()
rel_dict = torch.load(rel_dict_path)

dev = pickle.load(open("../relation_prediction/data/new_dev.pkl", "rb"))
test = pickle.load(open("../relation_prediction/data/new_test.pkl", "rb"))
train = pickle.load(open("../relation_prediction/data/new_train.pkl", "rb"))

# Write (subject, relation, object) triples to tab-separated files and
# collect the entities and relations into their dictionaries.
with open("../../data/embed/valid.txt", "w") as f:
    for index, row in dev.iterrows():
        f.write(row["subject"] + "\t" + row["relation"] + "\t" + row["object"] + "\n")
        ent_dict.add(row["subject"])
        ent_dict.add(row["object"])
        rel_dict.add(row["relation"])

with open("../../data/embed/test.txt", "w") as f:
    for index, row in test.iterrows():
        f.write(row["subject"] + "\t" + row["relation"] + "\t" + row["object"] + "\n")
        ent_dict.add(row["subject"])
        ent_dict.add(row["object"])
        rel_dict.add(row["relation"])

with open("../../data/embed/train.txt", "w") as f:
    for index, row in train.iterrows():
        f.write(row["subject"] + "\t" + row["relation"] + "\t" + row["object"] + "\n")
        'Original Subject Name': row['subject_name'],
        'Normalized Edit Distance': normalized_edit_distance,
        'Question Tokens': row['question_tokens'],
    })

exact_match = [d for d in print_data if d['Normalized Edit Distance'] == 1.0]

# Get the word dictionary.
word_vocab = Dictionary()
word_vocab.add_unk_token()
word_vocab.add_pad_token()
word_vocab.add_start_token()
word_vocab.add_end_token()
word_vocab.add("<e>")
add_word(df_dev)
add_word(df_test)
add_word(df_train)
torch.save(word_vocab, "../../data/vocab/word_vocab.pt")

# Get the training data and test data.
get_formatted_examples(128, '../../data/subject_recognition/dev.pt', df_dev)
get_formatted_examples(128, '../../data/subject_recognition/test.pt', df_test)
get_formatted_examples(128, '../../data/subject_recognition/train.pt', df_train)
class Graph:
    def __init__(self):
        self.vertices = []
        self.adjList = Dictionary()

    def addVertex(self, v):
        self.vertices.append(v)
        self.adjList.add(v, [])

    def addEdge(self, v, w):
        # Undirected graph: record the edge in both adjacency lists.
        self.adjList.get(v).append(w)
        self.adjList.get(w).append(v)

    def toString(self):
        stx = ''
        for v in self.vertices:
            stx += v + ' => '
            for neighbor in self.adjList.get(v):
                stx += neighbor + ' '
            stx += '\n'
        print(stx)
        return stx

    # Breadth-First Search, BFS
    def bfs(self, v, callback):
        # Initialize colors, distances, and predecessors.
        d = {}
        pred = {}
        color = {}
        for i in self.vertices:  # was 'vertices', which is undefined here
            color[i] = 'white'
            d[i] = 0
            pred[i] = None
        queue = Queue()
        queue.enqueue(v)
        while not queue.isEmpty():
            u = queue.dequeue()
            neighbors = self.adjList.get(u)
            color[u] = 'grey'
            for n in neighbors:
                if color[n] == 'white':
                    color[n] = 'grey'
                    # Record distance and predecessor.
                    d[n] = d[u] + 1
                    pred[n] = u
                    queue.enqueue(n)
            color[u] = 'black'
        print('distances =>', d)
        print('predecessors =>', pred)
        return pred

    # Depth-First Search, DFS
    def dfs(self, df_u):
        df_color = {}
        d = {}  # discovery time of each vertex
        f = {}  # finish time of each vertex
        p = {}  # predecessor of each vertex
        for i in self.vertices:  # was 'vertices', which is undefined here
            df_color[i] = 'white'
            d[i] = 0
            f[i] = 0
            p[i] = None
        self.dfsVisit(df_u, df_color, d, f, p, 0)
        print('discovery =>', d)
        print('finished =>', f)
        print('predecessor =>', p)

    def dfsVisit(self, u, color, d, f, p, time):
        # Return the updated time so the counter survives the recursion
        # (the original passed the int by value, losing the increments).
        print('discovered => ', u)
        color[u] = 'grey'
        time += 1
        d[u] = time
        for w in self.adjList.get(u):
            if color[w] == 'white':
                p[w] = u
                time = self.dfsVisit(w, color, d, f, p, time)
        color[u] = 'black'
        time += 1
        f[u] = time
        print('explored => ', u)
        return time
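# A minimal usage sketch. Queue and Dictionary below are hypothetical stand-ins
# with the interface the Graph above expects (enqueue/dequeue/isEmpty and
# add/get); the original project presumably supplies its own implementations.
from collections import deque


class Queue:
    def __init__(self): self.items = deque()
    def enqueue(self, x): self.items.append(x)
    def dequeue(self): return self.items.popleft()
    def isEmpty(self): return not self.items


class Dictionary:
    def __init__(self): self.items = {}
    def add(self, key, value): self.items[key] = value
    def get(self, key): return self.items[key]


g = Graph()
for v in ['A', 'B', 'C', 'D']:
    g.addVertex(v)
g.addEdge('A', 'B')
g.addEdge('A', 'C')
g.addEdge('B', 'D')
g.toString()      # A => B C, B => A D, ...
g.bfs('A', None)  # distances: A:0, B:1, C:1, D:2
g.dfs('A')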
import unittest

# Dictionary is the class under test; its import is assumed to be in scope.


class TestDictionary(unittest.TestCase):
    def setUp(self):
        self.d = Dictionary()

    def test_010_is_empty_when_created(self):
        self.assertEqual(self.d.entries(), {})

    def test_020_can_add_whole_entries_with_keyword_and_definition(self):
        self.d.add('fish', 'aquatic animal')
        self.assertEqual(self.d.entries(), {'fish': 'aquatic animal'})
        self.assertEqual(self.d.keywords(), ['fish'])

    def test_030_add_keywords_without_definition(self):
        self.d.add('fish')
        self.assertEqual(self.d.entries(), {'fish': None})
        self.assertEqual(self.d.keywords(), ['fish'])

    def test_040_can_check_whether_a_given_keyword_exists(self):
        self.assertFalse(self.d.includes('fish'))

    def test_050_doesnt_cheat_when_checking_whether_keyword_exists(self):
        self.assertFalse(self.d.includes('fish'))
        self.d.add('fish')
        self.assertTrue(self.d.includes('fish'))
        self.assertFalse(self.d.includes('bird'))

    def test_060_doesnt_include_a_prefix_in_and_of_itself(self):
        self.d.add('fish')
        self.assertFalse(self.d.includes('fi'))

    def test_070_doesnt_find_a_word_in_an_empty_dictionary(self):
        self.assertFalse(self.d.includes('fi'))

    def test_080_finds_nothing_if_the_prefix_matches_nothing(self):
        self.d.add('fiend')
        self.d.add('great')
        self.assertEqual(len(self.d.find('nothing')), 0)

    def test_090_finds_an_entry(self):
        self.d.add('fish', 'aquatic animal')
        self.assertEqual(self.d.find('fish'), {'fish': 'aquatic animal'})

    def test_100_finds_multiple_matches_from_a_prefix_and_returns_the_entire_entry(self):
        self.d.add('fish', 'aquatic animal')
        self.d.add('fiend', 'wicked person')
        self.d.add('great', 'remarkable')
        self.assertEqual(self.d.find('fi'),
                         {'fish': 'aquatic animal', 'fiend': 'wicked person'})

    def test_110_lists_keywords_alphabetically(self):
        self.d.add('zebra', 'African land animal with stripes')
        self.d.add('fish', 'aquatic animal')
        self.d.add('apple', 'fruit')
        self.assertEqual(self.d.keywords(), ['apple', 'fish', 'zebra'])

    def test_120_can_produce_printable_output(self):
        self.d.add('zebra', 'African land animal with stripes')
        self.d.add('fish', 'aquatic animal')
        self.d.add('apple', 'fruit')
        should_str = """[apple] \"fruit\"
[fish] \"aquatic animal\"
[zebra] \"African land animal with stripes\""""
        self.assertEqual(self.d.printable(), should_str)
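# Standard unittest entry point so the suite can be run directly
# (the file name is whatever this module is saved as, e.g. `python test_dictionary.py`).
if __name__ == '__main__':
    unittest.main()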