Example 1
def generate_unicode_categories(outputfilename):

    raw_text_dir = read.read_from_json('raw_data_dir')
    unicatedict = read.read_from_json("unicatedict")
    data_size = len(raw_text_dir)

    f = h5py.File("data/" + outputfilename + ".hdf5", "w")
    max_len_text = read.get_char2id_dict(raw_text_dir)
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')

    text_unicate_dict = dict()

    for data_id in range(data_size):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        text_inputs = [[
            unicatedict[unicodedata.category(char.decode("utf-8"))]
            for char in raw_text
        ]]
        text_unicate_dict[raw_text_dir[data_id]] = text_inputs[0]
        data_x = pad_sequences(text_inputs,
                               dtype='int8',
                               maxlen=max_len_text,
                               padding="post")
        dset[data_id, :] = data_x[0]
    read.save_in_json("text_unicate_dict", text_unicate_dict)
def generate_vocabulary(start=0, end=1):
    """
    Use a pre-defined gazetteer to label each character in the sentences. (Needed to build the end2end system.)
    :param start:
    :param end:
    :return:
    """
    import vocabulary

    vocab_dict = vocabulary.get_vocab_dict()
    n_vocab = max(map(int, vocab_dict.keys())) - 1
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    text_vocab_dict = dict()

    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        vocab_sentences = list()
        for sent in sentences_spans:
            a = np.ones(len(sent[0]))
            a = a.tolist()

            for index in range(n_vocab):
                vocab = vocab_dict[str(index + 2)]
                time_terms = re.compile('|'.join(vocab), re.IGNORECASE)
                for m in time_terms.finditer(sent[0]):
                    print len(sent[0])
                    print m.span()
                    for posi in range(m.span()[0], m.span()[1]):
                        a[posi] = index + 2
            vocab_sentences.append(a)
        text_vocab_dict[raw_dir_simple[data_id]] = vocab_sentences
    read.save_in_json("training_sentence/vocab/text_vocab_dict_normalized",
                      text_vocab_dict)
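
# A self-contained toy illustration of the gazetteer labelling above (the
# vocab_dict below is a made-up stand-in, not the project's real gazetteer):
# every character starts with label 1, and characters covered by a match of
# gazetteer class k (stored under the string key str(k)) are relabelled k.
import re
import numpy as np

toy_vocab_dict = {'2': ['january', 'february'], '3': [r'\d{4}']}
toy_sent = "Snow fell in January 2015."
toy_labels = np.ones(len(toy_sent)).tolist()
for key, terms in toy_vocab_dict.items():
    pattern = re.compile('|'.join(terms), re.IGNORECASE)
    for m in pattern.finditer(toy_sent):
        for posi in range(m.span()[0], m.span()[1]):
            toy_labels[posi] = int(key)
# toy_labels -> 1.0 everywhere except 2 over "January" and 3 over "2015"
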
def generate_unicodecate(start=0, end=63):
    """
    Generate the Unicode category for each character in the sentences. (Needed to build the end2end system.)
    :param start:
    :param end:
    :return:
    """
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    unicatedict = read.read_from_json("unicatedict")  #### in folder data/

    text_unicode_dict = dict()

    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        unicate_sentences = list()
        for sent in sentences_spans:
            unicate_sentences.append([
                unicatedict[unicodedata.category(char.decode("utf-8"))]
                for char in sent[0]
            ])
        print unicate_sentences
        text_unicode_dict[raw_dir_simple[data_id]] = unicate_sentences
    read.save_in_json(
        "training_sentence/unicode_category/text_unicode_category_dict_normalized",
        text_unicode_dict)
def split_by_sentence(start=0, end=63):
    """
    Split each document into sentences. (Needed to build the end2end system.)
    :param start:
    :param end:
    :return:
    """
    raw_text_dir = read.read_from_json('raw_data_dir')  #### in folder data/
    raw_dir_simple = read.read_from_json(
        'raw_dir_simple')  #### in folder data/
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        sent_tokenize_list = sent_tokenize(raw_text)
        sent_tokenize_span_list = spans(sent_tokenize_list, raw_text)

        sent_span_list = list()
        for sent_tokenize_span in sent_tokenize_span_list:
            sent_spans = list(
                regexp_span_tokenize(sent_tokenize_span[0], r'\n'))
            for sent_span in sent_spans:
                sent_span = (sent_span[0] + sent_tokenize_span[1],
                             sent_span[1] + sent_tokenize_span[1])
                sent_span_list.append((raw_text[sent_span[0]:sent_span[1]],
                                       sent_span[0], sent_span[1]))
        read.save_in_json(
            "training_sentence/sentences/" + raw_dir_simple[data_id],
            sent_span_list)
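
# The spans() helper is not part of this excerpt. A hypothetical minimal
# reconstruction of the two-argument form used above -- returning
# (sentence, start_offset) pairs by locating each tokenized sentence in the
# original text -- could look like this (a sketch, not the project's code).
# regexp_span_tokenize (nltk.tokenize.util) then yields (start, end) offsets
# relative to each sentence, which is why the sentence offset is added back.
def spans_sketch(chunks, raw_text):
    offset = 0
    result = list()
    for chunk in chunks:
        start = raw_text.find(chunk, offset)
        result.append((chunk, start))
        offset = start + len(chunk)
    return result
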
def get_pos(start, end):
    # raw_dir_simple = read.read_from_json('raw_dir_simple')
    # max_len = 0  # 106
    # pos_tag_vocab = defaultdict(float)
    # for data_id in range(start, end):
    #     pos_tags = []
    #     sent_spans = read.read_from_json("training_sentence/word_level_sentence/" + raw_dir_simple[data_id])
    #     for [sent,span] in sent_spans:
    #         pos_tag = pos_tagger(sent)
    #         pos_tag = [item[1] for item in pos_tag]
    #         for tag in pos_tag:
    #             pos_tag_vocab[tag]+=1
    #
    #         pos_tags.append(pos_tag)
    #
    #     read.save_in_json("training_sentence/word_level_postag/" + raw_dir_simple[data_id], pos_tags)

    #read.save_in_json("training_sentence/pos_vocab_word",pos_tag_vocab)
    pos_tag_vocab = read.read_from_json("training_sentence/pos_vocab_word")

    pos_tag_vocab['eof'] = 1

    pos_idx_map = dict()
    i = 1
    for word in pos_tag_vocab:
        pos_idx_map[word] = i
        i += 1

    read.save_in_json("training_sentence/pos_id", pos_idx_map)
def get_word_from_sentence(start, end):
    wordnet_lemmatizer = WordNetLemmatizer()
    raw_text_dir = read.read_from_json('raw_data_dir')
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    for data_id in range(start, end):
        word_level_chunk = list()
        word_level_chunk_lemma = list()
        sentences = read.read_from_json("training_sentence/sentences/" +
                                        raw_dir_simple[data_id])
        for sent in sentences:

            tokens = list()
            tokens_lemma = list()
            tokens_spans = list()
            for token in spans(sent[0]):
                print token[0]
                tokens_lemma.append(
                    wordnet_lemmatizer.lemmatize(token[0].lower()))
                tokens.append(token[0])
                tokens_spans.append((sent[1] + token[1], sent[1] + token[2]))
            word_level_chunk.append([tokens, tokens_spans])
            word_level_chunk_lemma.append([tokens_lemma, tokens_spans])
        read.save_in_json(
            "training_sentence/word_level_sentence/" + raw_dir_simple[data_id],
            word_level_chunk)
        read.save_in_json(
            "training_sentence/word_level_sentence_lemma/" +
            raw_dir_simple[data_id], word_level_chunk_lemma)
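
# Quick note on the lemmatization above: WordNetLemmatizer defaults to noun
# lemmas, so only the lowercased surface form is reduced; verbs are left
# unchanged unless a POS argument is passed.
from nltk.stem import WordNetLemmatizer
lemmatizer_demo = WordNetLemmatizer()
print(lemmatizer_demo.lemmatize("sentences".lower()))  # sentence
print(lemmatizer_demo.lemmatize("Fell".lower()))       # fell
print(lemmatizer_demo.lemmatize("fell", pos="v"))      # fall
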
def get_vocabulary(start, end, lemma):

    #raw_text_dir = read.read_from_json('raw_data_dir')

    raw_dir_simple = read.read_from_json('raw_dir_simple')
    vocab = defaultdict(float)
    for data_id in range(start, end):
        sent_spans = read.read_from_json(
            "training_sentence/word_level_sentence" + lemma + "/" +
            raw_dir_simple[data_id])
        for sent_index in range(len(sent_spans)):
            for word_index in range(len(sent_spans[sent_index][0])):
                vocab[sent_spans[sent_index][0][word_index]] += 1

    read.save_in_json("training_sentence/vocab_word" + lemma, vocab)

    vocab["\n"] += 1

    word_idx_map = dict()
    i = 1
    for word in vocab:
        word_idx_map[word] = i
        i += 1

    read.save_in_json("training_sentence/word_id" + lemma, word_idx_map)
Example 8
def generate_vocab_match(outputfilename):
    vocab_dict = get_vocab_dict()
    n_vocab = max(map(int, vocab_dict.keys())) - 1

    #print vocab
    # time_terms = re.compile('|'.join(vocab), re.IGNORECASE)

    raw_text_dir = read.read_from_json('raw_data_dir')
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    data_size = len(raw_text_dir)
    text_length = read.read_from_json('texts_length')

    f = h5py.File("data/" + outputfilename + ".hdf5", "w")
    max_len_text = read.get_char2id_dict(raw_text_dir)
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')
    text_vocab_dict = dict()

    for data_id in range(data_size):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        a = np.ones(text_length[data_id])
        for index in range(n_vocab):
            vocab = vocab_dict[str(index + 2)]
            time_terms = re.compile('|'.join(vocab), re.IGNORECASE)
            for m in time_terms.finditer(raw_text):
                a[m.span()[0]:m.span()[1]] = index + 2

        text_vocab_dict[raw_dir_simple[data_id]] = a.tolist()
        data_x = pad_sequences([a.tolist()],
                               dtype='int8',
                               maxlen=max_len_text,
                               padding="post")

        dset[data_id, :] = data_x[0]
    read.save_in_json("text_vocab_dict", text_vocab_dict)
def generate_character_pos():
    """
    Transform word-level POS tags into character-level POS tags. (Needed to build the end2end system.)
    :return:
    """

    start = 0
    end = 63
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    text_pos_text_dict = dict()

    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        pos_lists = read.read_from_json("training_sentence/pos/" +
                                        raw_dir_simple[data_id])
        pos_sentences = list()
        for sent_index in range(len(pos_lists)):
            postag = list()
            token_index = 0
            term = ""
            for char in sentences_spans[sent_index][0]:
                # if term =="leade":
                #     print "ok"
                if char == ' ':
                    term = ""
                    postag.append("null")
                else:
                    term += char
                    if term in pos_lists[sent_index][token_index][0] and len(
                            term) < len(pos_lists[sent_index][token_index][0]):
                        if bool(re.compile(r'[/\:\-]').match(char)):
                            if len(term) == 1:
                                postag.append(
                                    pos_lists[sent_index][token_index][1])
                            else:
                                postag.append('Sep')
                        else:
                            postag.append(
                                pos_lists[sent_index][token_index][1])
                    elif term in pos_lists[sent_index][token_index][0] and len(
                            term) == len(
                                pos_lists[sent_index][token_index][0]):
                        # if pos[index][token_index][1] =="CD" and bool(re.compile(r'[/\:\-]').match(char)):
                        #     postag.append('Sep')
                        # else:
                        postag.append(pos_lists[sent_index][token_index][1])
                        token_index += 1
                        term = ""
                        if token_index == len(pos_lists[sent_index]):
                            print postag
                            pos_sentences.append(postag)
        text_pos_text_dict[raw_dir_simple[data_id]] = pos_sentences
    read.save_in_json("training_sentence/pos/text_pos_text_dict_normalized",
                      text_pos_text_dict)
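
# A simplified toy version of the word-to-character POS expansion above: it
# assumes tokens are whitespace-separated and ignores the 'Sep' handling for
# '/', ':' and '-' inside tokens. Each character inherits the POS tag of its
# token, and spaces get the placeholder tag "null".
def chars_from_word_pos_sketch(sentence, word_pos):
    tags, token_index = [], 0
    for char in sentence:
        if char == ' ':
            tags.append("null")
            token_index += 1
        else:
            tags.append(word_pos[token_index][1])
    return tags

# chars_from_word_pos_sketch("Snow fell", [("Snow", "NN"), ("fell", "VBD")])
# -> ['NN', 'NN', 'NN', 'NN', 'null', 'VBD', 'VBD', 'VBD', 'VBD']
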
Example 10
def char2int_unicate2int():
    char2int = read.read_from_json('char2int')
    print char2int
    del char2int[u'empty']
    unicatelist_new = list()
    unicatedict = dict()
    unicatelist = list()
    for key, item in char2int.items():
        unicatelist.append(unicodedata.category(key))
    unicatelist_new = list(enumerate(set(unicatelist), start=1))
    for cate in unicatelist_new:
        unicatedict[cate[1]] = cate[0]
    read.save_in_json("unicatedict", unicatedict)
def get_word_tag(start, end):
    multi_labels = read.textfile2list("data/label/multi-hot.txt")
    multi_hot = read.counterList2Dict(list(enumerate(multi_labels, 1)))
    multi_hot = {y: x for x, y in multi_hot.iteritems()}

    raw_text_dir = read.read_from_json('raw_data_dir')
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    max_len = 0  #  106
    for data_id in range(start, end):
        xml_tags = read.read_from_json("training_sentence/xml_tags/" +
                                       raw_dir_simple[data_id])
        sent_spans = read.read_from_json(
            "training_sentence/word_level_sentence/" + raw_dir_simple[data_id])
        word_level_tags = list()
        for sent_index in range(len(sent_spans)):
            tags = list()
            for word_index in range(len(sent_spans[sent_index][0])):
                if len(xml_tags[sent_index]) == 0:
                    tags.append(0)
                elif sent_spans[sent_index][1][word_index][0] == int(
                        xml_tags[sent_index][0]
                    [0]) and sent_spans[sent_index][1][word_index][1] == int(
                        xml_tags[sent_index][0][1][0]):
                    xml_tag = extract_tag(xml_tags[sent_index][0])
                    intersection = [x for x in xml_tag if x in multi_labels]
                    if len(intersection) > 0:
                        tags.append(multi_hot[intersection[0]])
                    xml_tags[sent_index].pop(0)
                elif sent_spans[sent_index][1][word_index][1] < int(
                        xml_tags[sent_index][0][0]):
                    tags.append(0)
                else:
                    tags.append(0)
                    while len(xml_tags[sent_index]) > 0 and int(
                            xml_tags[sent_index][0][1][0]) <= int(
                                sent_spans[sent_index][1][word_index][1]):
                        xml_tags[sent_index].pop(0)

            word_level_tags.append(tags)

            max_len = max(len(tags), max_len)
        print max_len
        read.save_in_json(
            "training_sentence/word_level_sentence_tag/" +
            raw_dir_simple[data_id], word_level_tags)
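
# The multi_hot mapping above is just label -> integer id (1-based): the label
# list is enumerated starting at 1 and the resulting id -> label dict is then
# inverted. A minimal equivalent with made-up label names:
toy_multi_labels = ["DayOfMonth", "MonthOfYear", "Year"]  # toy labels
toy_multi_hot = {label: idx for idx, label in enumerate(toy_multi_labels, 1)}
# toy_multi_hot -> {'DayOfMonth': 1, 'MonthOfYear': 2, 'Year': 3}
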
def sentence_labeling(start=0, end=63):
    """
    Transform document-level labels into sentence-level labels.
    :param start:
    :param end:
    :return:
    """

    raw_dir_simple = read.read_from_json('raw_dir_simple')
    xmltags = read.read_from_json('xmltags_deleted_others')

    for data_id in range(start, end):
        tag_list = list()

        tag_span = xmltags[data_id].keys()
        tag_span = sorted(tag_span, key=int)
        print tag_span
        print raw_dir_simple[data_id]
        sentences = read.read_from_json("training_sentence/sentences/" +
                                        raw_dir_simple[data_id])
        i = 0
        for sent in sentences:
            tag = list()
            if i < len(tag_span):
                if sent[2] < int(tag_span[i]):
                    tag_list.append(tag)
                elif sent[1] <= int(tag_span[i]) and sent[2] > int(
                        tag_span[i]):
                    while True:
                        tag.append(
                            (tag_span[i], xmltags[data_id][tag_span[i]]))
                        i = i + 1
                        if i < len(tag_span):
                            if int(tag_span[i]) > sent[2]:
                                tag_list.append(tag)
                                break
                        else:
                            tag_list.append(tag)
                            break
            else:
                tag_list.append(tag)

        read.save_in_json(
            "training_sentence/xml_tags/" + raw_dir_simple[data_id], tag_list)
def pos_sentence(start=0, end=63):
    """
    Get POS tags for each sentence. (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    raw_dir_simple = read.read_from_json(
        'raw_dir_simple')  #### in folder data/
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',  #### in folder data/
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )  #### in folder data/
    english_postagger.java_options = '-mx4096m'

    pos = list()

    for data_id in range(start, end):
        sentences_spans = read.read_from_json("training_sentence/sentences/" +
                                              raw_dir_simple[data_id])
        print raw_dir_simple[data_id]
        pos_sentences = list()
        for sent_span in sentences_spans:
            print sent_span[0]
            text = nltk.word_tokenize(sent_span[0])
            # StanfordPOSTagger fails to tag the underscore, see
            # https://github.com/nltk/nltk/issues/1632. If using nltk 3.2.2,
            # change "word_tags = tagged_word.strip().split(self._SEPARATOR)"
            # in the parse_output function of nltk/tag/stanford.py to
            # "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)"
            # to handle the underscore issue.
            k = english_postagger.tag(text)
            index = 0

            for token in k:
                # Deal with double quotes: nltk's treebank tokenizer
                # (nltk/tokenize/treebank.py) rewrites double quotes (") as
                # doubled forward/backward single quotes (`` and ''), so map
                # them back to match the original text.
                if text[index] != token[0] and token[0] in ('``', "''"):
                    k[index] = ["\"", "\'\'"]
                if token[1] not in pos:
                    pos.append(token[1])
                index += 1
            pos_sentences.append(k)

        read.save_in_json("training_sentence/pos/" + raw_dir_simple[data_id],
                          pos_sentences)
    read.save_in_json("training_sentence/pos/pos_tag", pos)
Example 14
def generate_pos(start=0, end=63):
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )
    english_postagger.java_options = '-mx4096m'
    raw_text_dir = read.read_from_json('raw_data_dir')
    data_size = len(raw_text_dir)
    pos = list()

    for data_id in range(start, end):

        raw_text = read.read_from_dir(raw_text_dir[data_id])
        print raw_text_dir[data_id]
        contents = list()
        for line in raw_text.splitlines():
            print line
            text = nltk.word_tokenize(line)
            print text
            if len(text) == 0:
                k = []
            else:
                k = english_postagger.tag(text)
                index = 0
                for token in k:
                    # Deal with double quotes: nltk's treebank tokenizer
                    # (nltk/tokenize/treebank.py) rewrites double quotes (")
                    # as doubled forward/backward single quotes (`` and ''),
                    # so map them back to match the original text.
                    if text[index] != token[0] and token[0] in ('``', "''"):
                        k[index] = ["\"", "\'\'"]
                    if token[1] not in pos:
                        pos.append(token[1])
                    index += 1
            contents.append(k)

        read.save_json("data/pos/" + raw_text_dir[data_id].rsplit('\\', 1)[1],
                       contents)
    read.save_in_json("pos_tag", pos)