import re
import unicodedata
from collections import defaultdict

import h5py
import nltk
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from nltk.stem import WordNetLemmatizer
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import sent_tokenize
from nltk.tokenize.util import regexp_span_tokenize

import read
from vocabulary import get_vocab_dict


def generate_unicode_categories(outputfilename):
    raw_text_dir = read.read_from_json('raw_data_dir')
    unicatedict = read.read_from_json("unicatedict")
    data_size = len(raw_text_dir)
    f = h5py.File("data/" + outputfilename + ".hdf5", "w")
    max_len_text = read.get_char2id_dict(raw_text_dir)
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')
    text_unicate_dict = dict()
    for data_id in range(data_size):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        # Map every character to the integer id of its Unicode category (e.g. 'Lu', 'Nd').
        text_inputs = [[unicatedict[unicodedata.category(char.decode("utf-8"))]
                        for char in raw_text]]
        text_unicate_dict[raw_text_dir[data_id]] = text_inputs[0]
        data_x = pad_sequences(text_inputs, dtype='int8', maxlen=max_len_text,
                               padding="post")
        dset[data_id, :] = data_x[0]
    f.close()
    read.save_in_json("text_unicate_dict", text_unicate_dict)
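# A minimal usage sketch for generate_unicode_categories, assuming a
# hypothetical output name "unicode_categories" and that the function above
# completed without errors: it reopens the HDF5 file and checks the matrix shape.
def _inspect_unicode_hdf5(outputfilename="unicode_categories"):
    with h5py.File("data/" + outputfilename + ".hdf5", "r") as f:
        matrix = f["input"][:]
        print matrix.shape  # expected: (data_size, max_len_text)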
def generate_vocabulary(start=0, end=1):
    """
    Use a pre-defined gazetteer to label each character in the sentences.
    (needed to build the end-to-end system)
    :param start:
    :param end:
    :return:
    """
    vocab_dict = get_vocab_dict()
    n_vocab = max(map(int, vocab_dict.keys())) - 1
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    text_vocab_dict = dict()
    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        vocab_sentences = list()
        for sent in sentences_spans:
            # Default label 1 for every character; gazetteer matches overwrite it
            # with the id of the matching vocabulary (ids start at 2).
            a = np.ones(len(sent[0])).tolist()
            for index in range(n_vocab):
                vocab = vocab_dict[str(index + 2)]
                time_terms = re.compile('|'.join(vocab), re.IGNORECASE)
                for m in time_terms.finditer(sent[0]):
                    for posi in range(m.span()[0], m.span()[1]):
                        a[posi] = index + 2
            vocab_sentences.append(a)
        text_vocab_dict[raw_dir_simple[data_id]] = vocab_sentences
    read.save_in_json("training_sentence/vocab/text_vocab_dict_normalized",
                      text_vocab_dict)
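# A toy, self-contained illustration of the gazetteer labeling above. The
# two-entry vocab_dict and the sample sentence are made up; get_vocab_dict
# presumably returns a much larger dictionary keyed by string ids from "2" up.
def _gazetteer_demo():
    vocab_dict = {"2": ["january", "february"], "3": ["monday", "tuesday"]}
    n_vocab = max(map(int, vocab_dict.keys())) - 1
    sent = "Monday, January 2nd"
    a = np.ones(len(sent)).tolist()
    for index in range(n_vocab):
        pattern = re.compile('|'.join(vocab_dict[str(index + 2)]), re.IGNORECASE)
        for m in pattern.finditer(sent):
            for posi in range(m.span()[0], m.span()[1]):
                a[posi] = index + 2
    print a  # characters inside "January" get 2, inside "Monday" get 3, others stay 1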
def generate_unicodecate(start=0, end=63):
    """
    Generate the Unicode category for each character in the sentences.
    (needed to build the end-to-end system)
    :param start:
    :param end:
    :return:
    """
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    unicatedict = read.read_from_json("unicatedict")  # in folder data/
    text_unicode_dict = dict()
    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        unicate_sentences = list()
        for sent in sentences_spans:
            unicate_sentences.append([unicatedict[unicodedata.category(char.decode("utf-8"))]
                                      for char in sent[0]])
        text_unicode_dict[raw_dir_simple[data_id]] = unicate_sentences
    read.save_in_json("training_sentence/unicode_category/text_unicode_category_dict_normalized",
                      text_unicode_dict)
def split_by_sentence(start=0, end=63):
    """
    Split each document into sentences.
    (needed to build the end-to-end system)
    :param start:
    :param end:
    :return:
    """
    raw_text_dir = read.read_from_json('raw_data_dir')      # in folder data/
    raw_dir_simple = read.read_from_json('raw_dir_simple')  # in folder data/
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        sent_tokenize_list = sent_tokenize(raw_text)
        sent_tokenize_span_list = spans(sent_tokenize_list, raw_text)
        sent_span_list = list()
        for sent_tokenize_span in sent_tokenize_span_list:
            # Split each sentence further at newlines, shifting the sub-spans
            # back into document coordinates.
            sent_spans = list(regexp_span_tokenize(sent_tokenize_span[0], r'\n'))
            for sent_span in sent_spans:
                sent_span = (sent_span[0] + sent_tokenize_span[1],
                             sent_span[1] + sent_tokenize_span[1])
                sent_span_list.append((raw_text[sent_span[0]:sent_span[1]],
                                       sent_span[0], sent_span[1]))
        read.save_in_json("training_sentence/sentences/" + raw_dir_simple[data_id],
                          sent_span_list)
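# `spans` above is defined elsewhere in the repo and is not shown in this file.
# A minimal sketch of what the two-argument call appears to expect, assuming it
# pairs each sentence with its start offset in raw_text (the name
# _sentence_spans_sketch is hypothetical):
def _sentence_spans_sketch(sentences, raw_text):
    offset = 0
    for sent in sentences:
        # Search from the end of the previous match so offsets stay monotonic.
        start = raw_text.find(sent, offset)
        offset = start + len(sent)
        yield sent, start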
def get_pos(start, end):
    # The commented block below documents how "training_sentence/pos_vocab_word"
    # was originally produced from the word-level sentences.
    # raw_dir_simple = read.read_from_json('raw_dir_simple')
    # max_len = 0  # 106
    # pos_tag_vocab = defaultdict(float)
    # for data_id in range(start, end):
    #     pos_tags = []
    #     sent_spans = read.read_from_json("training_sentence/word_level_sentence/" + raw_dir_simple[data_id])
    #     for [sent, span] in sent_spans:
    #         pos_tag = pos_tagger(sent)
    #         pos_tag = [item[1] for item in pos_tag]
    #         for tag in pos_tag:
    #             pos_tag_vocab[tag] += 1
    #         pos_tags.append(pos_tag)
    #     read.save_in_json("training_sentence/word_level_postag/" + raw_dir_simple[data_id], pos_tags)
    # read.save_in_json("training_sentence/pos_vocab_word", pos_tag_vocab)
    pos_tag_vocab = read.read_from_json("training_sentence/pos_vocab_word")
    pos_tag_vocab['eof'] = 1
    # Enumerate the POS vocabulary into integer ids starting at 1.
    pos_idx_map = dict()
    for i, word in enumerate(pos_tag_vocab, start=1):
        pos_idx_map[word] = i
    read.save_in_json("training_sentence/pos_id", pos_idx_map)
def get_word_from_sentence(start, end):
    wordnet_lemmatizer = WordNetLemmatizer()
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    for data_id in range(start, end):
        word_level_chunk = list()
        word_level_chunk_lemma = list()
        sentences = read.read_from_json("training_sentence/sentences/" +
                                        raw_dir_simple[data_id])
        for sent in sentences:
            tokens = list()
            tokens_lemma = list()
            tokens_spans = list()
            for token in spans(sent[0]):
                tokens_lemma.append(wordnet_lemmatizer.lemmatize(token[0].lower()))
                tokens.append(token[0])
                # Shift token offsets from sentence to document coordinates.
                tokens_spans.append((sent[1] + token[1], sent[1] + token[2]))
            word_level_chunk.append([tokens, tokens_spans])
            word_level_chunk_lemma.append([tokens_lemma, tokens_spans])
        read.save_in_json("training_sentence/word_level_sentence/" +
                          raw_dir_simple[data_id], word_level_chunk)
        read.save_in_json("training_sentence/word_level_sentence_lemma/" +
                          raw_dir_simple[data_id], word_level_chunk_lemma)
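# The one-argument spans(sent[0]) call above appears to yield (token, start, end)
# triples. A minimal sketch built on NLTK's WhitespaceTokenizer (the helper name
# and the tokenizer choice are assumptions; the repo's real helper may tokenize
# differently):
from nltk.tokenize import WhitespaceTokenizer

def _token_spans_sketch(text):
    for start, end in WhitespaceTokenizer().span_tokenize(text):
        yield text[start:end], start, end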
def get_vocabulary(start, end, lemma):
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    vocab = defaultdict(float)
    for data_id in range(start, end):
        sent_spans = read.read_from_json("training_sentence/word_level_sentence" +
                                         lemma + "/" + raw_dir_simple[data_id])
        for sent_index in range(len(sent_spans)):
            for word_index in range(len(sent_spans[sent_index][0])):
                vocab[sent_spans[sent_index][0][word_index]] += 1
    read.save_in_json("training_sentence/vocab_word" + lemma, vocab)
    # The newline token is added only to the id map, not to the saved counts.
    vocab["\n"] += 1
    word_idx_map = dict()
    for i, word in enumerate(vocab, start=1):
        word_idx_map[word] = i
    read.save_in_json("training_sentence/word_id" + lemma, word_idx_map)
def generate_vocab_match(outputfilename):
    vocab_dict = get_vocab_dict()
    n_vocab = max(map(int, vocab_dict.keys())) - 1
    raw_text_dir = read.read_from_json('raw_data_dir')
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    data_size = len(raw_text_dir)
    text_length = read.read_from_json('texts_length')
    f = h5py.File("data/" + outputfilename + ".hdf5", "w")
    max_len_text = read.get_char2id_dict(raw_text_dir)
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')
    text_vocab_dict = dict()
    for data_id in range(data_size):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        # Default label 1 for every character; gazetteer matches overwrite it
        # with the id of the matching vocabulary (ids start at 2).
        a = np.ones(text_length[data_id])
        for index in range(n_vocab):
            vocab = vocab_dict[str(index + 2)]
            time_terms = re.compile('|'.join(vocab), re.IGNORECASE)
            for m in time_terms.finditer(raw_text):
                a[m.span()[0]:m.span()[1]] = index + 2
        text_vocab_dict[raw_dir_simple[data_id]] = a.tolist()
        data_x = pad_sequences([a.tolist()], dtype='int8', maxlen=max_len_text,
                               padding="post")
        dset[data_id, :] = data_x[0]
    f.close()
    read.save_in_json("text_vocab_dict", text_vocab_dict)
def generate_character_pos():
    """
    Transform word-level POS tags into character-level POS tags.
    (needed to build the end-to-end system)
    :return:
    """
    start = 0
    end = 63
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    text_pos_text_dict = dict()
    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        pos_lists = read.read_from_json("training_sentence/pos/" +
                                        raw_dir_simple[data_id])
        pos_sentences = list()
        for sent_index in range(len(pos_lists)):
            postag = list()
            token_index = 0
            term = ""
            for char in sentences_spans[sent_index][0]:
                if char == ' ':
                    term = ""
                    postag.append("null")
                else:
                    term += char
                    if term in pos_lists[sent_index][token_index][0] and \
                            len(term) < len(pos_lists[sent_index][token_index][0]):
                        # Inside a token: separator characters ('/', ':', '-') get
                        # the 'Sep' tag unless they start the token.
                        if bool(re.compile(r'[/\:\-]').match(char)):
                            if len(term) == 1:
                                postag.append(pos_lists[sent_index][token_index][1])
                            else:
                                postag.append('Sep')
                        else:
                            postag.append(pos_lists[sent_index][token_index][1])
                    elif term in pos_lists[sent_index][token_index][0] and \
                            len(term) == len(pos_lists[sent_index][token_index][0]):
                        # The token is complete: tag its last character, then
                        # move on to the next token.
                        postag.append(pos_lists[sent_index][token_index][1])
                        token_index += 1
                        term = ""
            pos_sentences.append(postag)
        text_pos_text_dict[raw_dir_simple[data_id]] = pos_sentences
    read.save_in_json("training_sentence/pos/text_pos_text_dict_normalized",
                      text_pos_text_dict)
def char2int_unicate2int():
    char2int = read.read_from_json('char2int')
    del char2int[u'empty']
    # Collect the Unicode category of every known character, then enumerate
    # the distinct categories into integer ids starting at 1.
    unicatelist = [unicodedata.category(key) for key in char2int]
    unicatedict = dict()
    for cate_id, cate in enumerate(set(unicatelist), start=1):
        unicatedict[cate] = cate_id
    read.save_in_json("unicatedict", unicatedict)
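# A small worked example of the mapping char2int_unicate2int builds:
# unicodedata.category returns two-letter category codes, which are enumerated
# into integer ids starting at 1 (the sample characters are arbitrary).
def _unicode_category_demo():
    chars = [u'A', u'a', u'7', u' ', u'-']
    cates = [unicodedata.category(c) for c in chars]  # ['Lu', 'Ll', 'Nd', 'Zs', 'Pd']
    unicatedict = dict((cate, i) for i, cate in enumerate(set(cates), start=1))
    print unicatedict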
def get_word_tag(start, end):
    multi_labels = read.textfile2list("data/label/multi-hot.txt")
    multi_hot = read.counterList2Dict(list(enumerate(multi_labels, 1)))
    multi_hot = {y: x for x, y in multi_hot.iteritems()}
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    max_len = 0  # 106
    for data_id in range(start, end):
        xml_tags = read.read_from_json("training_sentence/xml_tags/" +
                                       raw_dir_simple[data_id])
        sent_spans = read.read_from_json("training_sentence/word_level_sentence/" +
                                         raw_dir_simple[data_id])
        word_level_tags = list()
        for sent_index in range(len(sent_spans)):
            tags = list()
            for word_index in range(len(sent_spans[sent_index][0])):
                if len(xml_tags[sent_index]) == 0:
                    tags.append(0)
                elif sent_spans[sent_index][1][word_index][0] == int(xml_tags[sent_index][0][0]) and \
                        sent_spans[sent_index][1][word_index][1] == int(xml_tags[sent_index][0][1][0]):
                    # The word span matches the next XML tag span exactly: emit the
                    # id of the first label in the multi-hot set. (extract_tag is
                    # defined elsewhere in the repo.)
                    xml_tag = extract_tag(xml_tags[sent_index][0])
                    intersection = [x for x in xml_tag if x in multi_labels]
                    if len(intersection) > 0:
                        tags.append(multi_hot[intersection[0]])
                    xml_tags[sent_index].pop(0)
                elif sent_spans[sent_index][1][word_index][1] < int(xml_tags[sent_index][0][0]):
                    tags.append(0)
                else:
                    # The word overlaps but does not align with the tag span:
                    # emit 0 and drop every tag that ends inside this word.
                    tags.append(0)
                    while len(xml_tags[sent_index]) > 0 and \
                            int(xml_tags[sent_index][0][1][0]) <= int(sent_spans[sent_index][1][word_index][1]):
                        xml_tags[sent_index].pop(0)
            word_level_tags.append(tags)
            max_len = max(len(tags), max_len)
        print max_len
        read.save_in_json("training_sentence/word_level_sentence_tag/" +
                          raw_dir_simple[data_id], word_level_tags)
def sentence_labeling(start=0, end=63):
    """
    Transform document-level labels into sentence-level labels.
    :param start:
    :param end:
    :return:
    """
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    xmltags = read.read_from_json('xmltags_deleted_others')
    for data_id in range(start, end):
        tag_list = list()
        tag_span = sorted(xmltags[data_id].keys(), key=int)
        print raw_dir_simple[data_id]
        sentences = read.read_from_json("training_sentence/sentences/" +
                                        raw_dir_simple[data_id])
        i = 0
        for sent in sentences:
            tag = list()
            if i < len(tag_span):
                if sent[2] < int(tag_span[i]):
                    # The next tag starts after this sentence ends.
                    tag_list.append(tag)
                elif sent[1] <= int(tag_span[i]) and sent[2] > int(tag_span[i]):
                    # Collect every tag whose start offset falls inside this sentence.
                    while True:
                        tag.append((tag_span[i], xmltags[data_id][tag_span[i]]))
                        i = i + 1
                        if i < len(tag_span):
                            if int(tag_span[i]) > sent[2]:
                                tag_list.append(tag)
                                break
                        else:
                            tag_list.append(tag)
                            break
            else:
                tag_list.append(tag)
        read.save_in_json("training_sentence/xml_tags/" + raw_dir_simple[data_id],
                          tag_list)
def pos_sentence(start=0, end=63):
    """
    Get POS tags for each sentence.
    (needed to build the end-to-end system)
    :param start:
    :param end:
    :return:
    """
    raw_dir_simple = read.read_from_json('raw_dir_simple')  # in folder data/
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar')
    english_postagger.java_options = '-mx4096m'
    pos = list()
    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        print raw_dir_simple[data_id]
        pos_sentences = list()
        for sent_span in sentences_spans:
            text = nltk.word_tokenize(sent_span[0])
            # StanfordPOSTagger fails on tokens containing underscores, see
            # https://github.com/nltk/nltk/issues/1632. With nltk 3.2.2, change
            # "word_tags = tagged_word.strip().split(self._SEPARATOR)" in the
            # parse_output method of nltk/tag/stanford.py to
            # "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)".
            k = english_postagger.tag(text)
            index = 0
            for token in k:
                # nltk's treebank tokenizer rewrites double quotes (") as doubled
                # forward/backward single quotes (`` and ''), so map them back
                # before aligning with the raw sentence.
                if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
                    k[index] = ["\"", "\'\'"]
                if token[1] not in pos:
                    pos.append(token[1])
                index += 1
            pos_sentences.append(k)
        read.save_in_json("training_sentence/pos/" + raw_dir_simple[data_id],
                          pos_sentences)
    read.save_in_json("training_sentence/pos/pos_tag", pos)
def generate_pos(start=0, end=63):
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar')
    english_postagger.java_options = '-mx4096m'
    raw_text_dir = read.read_from_json('raw_data_dir')
    pos = list()
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        print raw_text_dir[data_id]
        contents = list()
        for line in raw_text.splitlines():
            text = nltk.word_tokenize(line)
            if len(text) == 0:
                k = []
            else:
                k = english_postagger.tag(text)
            index = 0
            for token in k:
                # Map the treebank tokenizer's `` and '' back to plain double
                # quotes (nltk.tokenize rewrites " as doubled forward/backward
                # single quotes).
                if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):
                    k[index] = ["\"", "\'\'"]
                if token[1] not in pos:
                    pos.append(token[1])
                index += 1
            contents.append(k)
        read.save_json("data/pos/" + raw_text_dir[data_id].rsplit('\\', 1)[1],
                       contents)
    read.save_in_json("pos_tag", pos)