Example #1
def generate_unicode_categories(outputfilename):

    raw_text_dir = read.read_from_json('raw_data_dir')
    unicatedict = read.read_from_json("unicatedict")
    data_size = len(raw_text_dir)

    f = h5py.File("data/" + outputfilename + ".hdf5", "w")
    max_len_text = read.get_char2id_dict(raw_text_dir)
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')

    text_unicate_dict = dict()

    for data_id in range(data_size):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        # unicodedata.category returns a two-letter general-category code
        # (e.g. 'Lu', 'Nd'); unicatedict maps each code to an integer id.
        # raw_text is already text in Python 3, so no per-character decode is needed.
        text_inputs = [[
            unicatedict[unicodedata.category(char)]
            for char in raw_text
        ]]
        text_unicate_dict[raw_text_dir[data_id]] = text_inputs[0]
        data_x = pad_sequences(text_inputs,
                               dtype='int8',
                               maxlen=max_len_text,
                               padding="post")
        dset[data_id, :] = data_x[0]
    read.save_in_json("text_unicate_dict", text_unicate_dict)
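
The mapping above leans on unicodedata.category, which returns a two-letter general-category code for every character; unicatedict (loaded from JSON) evidently maps those codes to small integer ids. A minimal sketch of how such a dictionary could be built, assuming id 0 is kept free for the padding that pad_sequences adds:

import unicodedata

def build_unicate_dict(texts):
    # Hypothetical stand-in for the unicatedict loaded from JSON above.
    unicatedict = {}
    for text in texts:
        for char in text:
            cat = unicodedata.category(char)   # e.g. 'Lu', 'Ll', 'Nd', 'Po'
            if cat not in unicatedict:
                unicatedict[cat] = len(unicatedict) + 1   # 0 reserved for padding
    return unicatedict

print(unicodedata.category('A'))   # 'Lu' -- uppercase letter
print(unicodedata.category('7'))   # 'Nd' -- decimal digit
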
def split_by_sentence(start=0, end=63):
    """
    Split the document into sentence.    (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    raw_text_dir = read.read_from_json('raw_data_dir')  #### in folder data/
    raw_dir_simple = read.read_from_json(
        'raw_dir_simple')  #### in folder data/
    for data_id in range(start, end):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        sent_tokenize_list = sent_tokenize(raw_text)
        sent_tokenize_span_list = spans(sent_tokenize_list, raw_text)

        sent_span_list = list()
        for sent_tokenize_span in sent_tokenize_span_list:
            sent_spans = list(
                regexp_span_tokenize(sent_tokenize_span[0], r'\n'))
            for sent_span in sent_spans:
                sent_span = (sent_span[0] + sent_tokenize_span[1],
                             sent_span[1] + sent_tokenize_span[1])
                sent_span_list.append((raw_text[sent_span[0]:sent_span[1]],
                                       sent_span[0], sent_span[1]))
        read.save_in_json(
            "training_sentence/sentences/" + raw_dir_simple[data_id],
            sent_span_list)
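
spans is a project helper that is not shown in this listing; judging by how its output is used (element 0 is the sentence text, element 1 its start offset in the document), it pairs each sentence returned by sent_tokenize with its character offsets. A possible sketch, assuming sentences occur in order so str.find can locate each one (this is a guess at the helper, not its actual source):

def spans(sentences, raw_text):
    """Hypothetical reimplementation: yield (sentence, start, end) triples."""
    offset = 0
    for sent in sentences:
        start = raw_text.find(sent, offset)   # locate the sentence after the previous one
        end = start + len(sent)
        offset = end
        yield sent, start, end
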
Example #3
def generate_vocab_match(outputfilename):
    vocab_dict = get_vocab_dict()
    # Vocabulary class ids start at 2 (0 is padding, 1 means "no match"),
    # so the number of classes is the largest id minus 1.
    n_vocab = max(map(int, vocab_dict.keys())) - 1

    raw_text_dir = read.read_from_json('raw_data_dir')
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    data_size = len(raw_text_dir)
    text_length = read.read_from_json('texts_length')

    f = h5py.File("data/" + outputfilename + ".hdf5", "w")
    max_len_text = read.get_char2id_dict(raw_text_dir)
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')
    text_vocab_dict = dict()

    for data_id in range(data_size):
        raw_text = read.read_from_dir(raw_text_dir[data_id])
        # Label 1 marks characters that match no vocabulary term; matched
        # spans are overwritten with the id of the matching vocabulary class.
        a = np.ones(text_length[data_id])
        for index in range(n_vocab):
            vocab = vocab_dict[str(index + 2)]
            time_terms = re.compile('|'.join(vocab), re.IGNORECASE)
            for m in time_terms.finditer(raw_text):
                a[m.span()[0]:m.span()[1]] = index + 2

        text_vocab_dict[raw_dir_simple[data_id]] = a.tolist()
        data_x = pad_sequences([a.tolist()],
                               dtype='int8',
                               maxlen=max_len_text,
                               padding="post")

        dset[data_id, :] = data_x[0]
    read.save_in_json("text_vocab_dict", text_vocab_dict)
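
The per-class pattern is simply the alternation of every term in that class. If the vocabulary terms can contain regex metacharacters such as dots or parentheses, escaping them keeps the match literal; a small sketch of that pattern, with made-up example terms:

import re

terms = ["Jan.", "January", "the 1st"]   # invented vocabulary class
pattern = re.compile('|'.join(re.escape(t) for t in terms), re.IGNORECASE)

for m in pattern.finditer("Payment is due the 1st of January."):
    print(m.span(), m.group())   # character offsets of each hit, as used above
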
Example #4
def get_vocab_dict():
    data = read.read_from_dir("data/vocab/vocab2.txt")
    vocab_dict = dict()
    for line in data.splitlines():
        items = line.split()
        # items[0] is the vocabulary term, items[1] is its class id.
        if items[1] in vocab_dict:
            vocab_dict[items[1]].append(items[0])
        else:
            vocab_dict[items[1]] = [items[0]]

    return vocab_dict
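
From the parsing above, each line of vocab2.txt apparently holds a term followed by its class id. The same grouping reads a little more directly with collections.defaultdict; a sketch in which the file contents are invented for illustration:

from collections import defaultdict

# Invented example of the assumed "<term> <class id>" layout of vocab2.txt.
lines = ["january 2", "february 2", "monday 3", "tuesday 3"]

vocab_dict = defaultdict(list)
for line in lines:
    term, class_id = line.split()
    vocab_dict[class_id].append(term)

# vocab_dict -> {'2': ['january', 'february'], '3': ['monday', 'tuesday']}
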
def generate_pos(start=0, end=63):
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )
    english_postagger.java_options = '-mx4096m'
    raw_text_dir = read.read_from_json('raw_data_dir')
    data_size = len(raw_text_dir)
    pos = list()

    for data_id in range(start, end):

        raw_text = read.read_from_dir(raw_text_dir[data_id])
        print(raw_text_dir[data_id])
        contents = list()
        for line in raw_text.splitlines():
            print(line)
            text = nltk.word_tokenize(line)
            print(text)
            if len(text) == 0:
                k = []
            else:
                k = english_postagger.tag(text)
                index = 0
                for token in k:
                    # nltk's treebank tokenizer rewrites double quotes (") as
                    # `` and ''; restore the original character so the token
                    # text still matches the raw line.
                    if (text[index] != token[0]) and (
                            token[0] == '``' or token[0] == "''"):
                        k[index] = ["\"", "\'\'"]
                    if token[1] not in pos:
                        pos.append(token[1])
                    index += 1
            contents.append(k)

        read.save_json("data/pos/" + raw_text_dir[data_id].rsplit('\\', 1)[1],
                       contents)
    read.save_in_json("pos_tag", pos)
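
StanfordPOSTagger shells out to the Stanford tagger jar, so the model and jar arguments above must point at a local installation; the hard-coded Windows paths are the original author's. A minimal usage sketch with placeholder paths (adjust them to your own install):

import nltk
from nltk.tag.stanford import StanfordPOSTagger

tagger = StanfordPOSTagger(
    'models/english-left3words-distsim.tagger',   # placeholder model path
    'stanford-postagger.jar',                     # placeholder jar path
    java_options='-mx4096m')                      # same heap size as generate_pos

tokens = nltk.word_tokenize("The meeting is on Friday.")
print(tagger.tag(tokens))   # list of (token, Penn Treebank tag) pairs
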
start = 0
end = 63
raw_text_dir = read.read_from_json('raw_data_dir')
raw_dir_simple = read.read_from_json('raw_dir_simple')

# data_size = len(raw_text_dir)
max_len_text = read.get_char2id_dict(raw_text_dir)
char2int = read.read_from_json('char2int')
int2char = {idx: char for char, idx in char2int.items()}  # avoid shadowing the built-in int

text_pos_text_dict = dict()
for data_id in range(start, end):
    print(raw_dir_simple[data_id])
    pos = read.read_json("data/pos/" + raw_dir_simple[data_id])
    raw_text = read.read_from_dir(raw_text_dir[data_id])

    text_inputs = [[char2int[char] for char in raw_text]]
    postag = list()
    index = 0
    for line in raw_text.splitlines():
        if len(line) == 0:
            postag.append('\n')
            index += 1
        else:
            token_index = 0
            term = ""
            for char in line:
                if char == ' ':