Example #1
    def process_batch_preana(batch):
        # Turn pre-analyzed (index, paragraph) pairs into per-sentence lists of Samples
        for index, paragraph in batch:
            for sentence in paragraph:
                sequence = []
                for token in sentence:
                    sample = Sample()
                    sequence.append(sample)
                    sample.features['token'] = token.form
                    sample.features['tags'] = uniq(
                        [form.tags for form in token.interpretations])
                    sample.features['maca_lemmas'] = uniq([
                        (form.lemma, form.tags)
                        for form in token.interpretations
                    ])
                    sample.features['space_before'] = [
                        'space_before'
                    ] if token.space_before else ['no_space_before']

                Preprocess.create_features(sequence)

                if sequence:
                    yield sequence
Example #2
    def process_batch(documents: Iterable[str], maca_config: str,
                      toki_config_path: str) -> Generator[List[Sample], None, None]:
        maca_analyzer = MacaAnalyzer(maca_config, toki_config_path)

        for document_id, document in enumerate(documents):
            results = maca_analyzer._maca(document)

            for res in results:
                result = maca_analyzer._parse(res)

                sequence = []
                for form, space_before, interpretations, start, end in result:
                    sample = Sample()
                    sequence.append(sample)
                    sample.features['token'] = form
                    sample.features['tags'] = uniq([t for l, t in interpretations])
                    # Strip trailing sense markers (e.g. ":s1") from lemmas
                    interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t)
                                       for l, t in interpretations]
                    sample.features['maca_lemmas'] = [(l.replace('_', ' '), t) for l, t in uniq(interpretations)]

                    # TODO: cleanup space before
                    sample.features['space_before'] = ['space_before'] if space_before != 'none' else ['no_space_before']
                    sample.features['space_before'].append(space_before)
                    sample.features['start'] = start
                    sample.features['end'] = end
                    sample.features['document_id'] = document_id
                Preprocess.create_features(sequence)

                if sequence:
                    yield sequence
Example #3
    def process_batch(batch, maca_config, toki_config_path):
        # Collect the raw lines and run them through Maca in one batch
        batchC = [line for index, line in batch]

        results = Preprocess.maca(batchC, maca_config, toki_config_path)
        for res in results:
            result = Preprocess.parse(res)

            # TODO: features
            sequence = []
            for form, space_before, interpretations in result:
                sample = Sample()
                sequence.append(sample)
                sample.features['token'] = form
                sample.features['tags'] = uniq([t for l, t in interpretations])
                sample.features['maca_lemmas'] = interpretations
                sample.features['space_before'] = [
                    'space_before'
                ] if space_before == 'space' else ['no_space_before']

            Preprocess.create_features(sequence)

            if sequence:
                yield sequence
Example #4
def load_dataset(input_file, word_id=0, word_to_id=None, update_word_ids=True, mode='memnn'):
    # Avoid the mutable-default-argument pitfall
    if word_to_id is None:
        word_to_id = {}
    dataset_ids = []
    label_ids = []
    with open(input_file) as f:
        article = {}
        article_no = 0
        for line in f:
            line = line.strip()
            if len(line) > 0 and line[:2] == '1 ' and len(dataset_ids) > 0: # new article
                article = {}
                article_no += 1
            if '\t' in line: # question
                question_parts = line.split('\t')
                tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split()
                if update_word_ids:
                    for token in tokens[1:]:
                        if token not in word_to_id:
                            word_to_id[token] = word_id
                            word_id += 1
                    if question_parts[1] not in word_to_id:
                        word_to_id[question_parts[1]] = word_id
                        word_id += 1

                stmt_ids = map(int, question_parts[2].strip().split())
                sequence = []
                if mode == 'baseline':
                    for s in range(int(tokens[0])):
                        if s in article:
                            sequence += article[s]
                else:
                    for s in stmt_ids:
                        sequence += article[s]

                for token in tokens[1:]:
                    sequence.append(token)

                if article_no == 0:
                    print("seq: %s | label: %s" % (' '.join(sequence).ljust(70), question_parts[1]))

                dataset_ids.append([word_to_id[t] for t in sequence])  # materialize ids (a bare map is lazy on Python 3)
                label_ids.append(word_to_id[question_parts[1]])

            else: # statement
                tokens = re.sub(r'([\.\?])$', r' \1', line).split()
                if update_word_ids:
                    for token in tokens[1:]:
                        if token not in word_to_id:
                            word_to_id[token] = word_id
                            word_id += 1

                line_no = int(tokens[0])
                article[line_no] = []
                for token in tokens[1:]:
                    article[line_no].append(token)

    return dataset_ids, label_ids, word_to_id, word_id
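
A minimal usage sketch for load_dataset, assuming bAbI-style input (numbered statement lines, and question lines formatted as "question<TAB>answer<TAB>supporting statement ids"); the file name here is hypothetical:

    # Write a tiny bAbI-style file (hypothetical name and contents)
    with open('qa_sample.txt', 'w') as fw:
        fw.write('1 Mary moved to the bathroom.\n'
                 '2 John went to the hallway.\n'
                 '3 Where is Mary?\tbathroom\t1\n')

    dataset_ids, label_ids, word_to_id, word_id = load_dataset('qa_sample.txt')
    print(dataset_ids[0])  # word ids for "Mary moved to the bathroom . Where is Mary ?"
    print(label_ids[0])    # word id of "bathroom"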
Example #5
def get_data(data_path, word_dict=None, label_dict=None, mode=None):
    # Process word-vector data
    if mode == 'vec':
        words_vec = []
        with open(data_path, encoding='utf-8') as fr:
            for line in fr:
                if line != '\n':
                    word_vec = line.strip().split()[1:]
                    words_vec.append(word_vec)
        # Parse each component as a float (safer and faster than eval())
        all_vec = np.asarray([[float(x) for x in each] for each in words_vec])
        with open('./data/word_vec.pkl', 'wb') as fw:
            pickle.dump(all_vec, fw)
        return True
    # Build the vocabulary
    elif mode == 'vocab':
        word_list = []
        with open(data_path, encoding='utf-8') as fr:
            for line in fr:
                if line != '\n':
                    word_list.append(line.strip())
        # Special tokens for padding and out-of-vocabulary characters
        word_list = ['pad', 'unknown'] + word_list
        word_dict = {word: idx for idx, word in enumerate(word_list)}
        return word_dict
    # Process training and test data (sequences are separated by blank lines)
    else:
        data, labels = [], []
        with open(data_path, encoding='utf-8') as fr:
            lines = fr.readlines()
        sequence, tag = [], []
        for line in lines:
            if line != '\n':
                [char, label] = line.strip().split()
                sequence.append(char)
                tag.append(label)
            else:
                sequence_ids = [
                    word_dict[char]
                    if char in word_dict else word_dict['unknown']
                    for char in sequence
                ]
                tag_ids = [label_dict[label] for label in tag]
                data.append(sequence_ids)
                labels.append(tag_ids)
                sequence, tag = [], []
        return data, labels
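
A minimal sketch exercising the 'vocab' branch of get_data above; the file name and contents are hypothetical, one character per line as the code expects:

    with open('vocab.txt', 'w', encoding='utf-8') as fw:
        fw.write('中\n国\n人\n')

    word_dict = get_data('vocab.txt', mode='vocab')
    print(word_dict)  # {'pad': 0, 'unknown': 1, '中': 2, '国': 3, '人': 4}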
Example #6
    def deco2sentence(self, decoderOutputs):
        """Greedily decode the decoder output into a list of token ids.

        decoderOutputs (list<np.array>): one score vector per output step
        """
        sequence = []

        # Choose the word with the highest prediction score at each step
        for out in decoderOutputs:
            sequence.append(np.argmax(out))
        return sequence
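
A quick sketch of the greedy rule deco2sentence applies, with hypothetical per-step score vectors over a 4-word vocabulary:

    import numpy as np

    decoder_outputs = [np.array([0.1, 0.7, 0.1, 0.1]),
                       np.array([0.2, 0.1, 0.6, 0.1])]
    ids = [int(np.argmax(out)) for out in decoder_outputs]
    print(ids)  # [1, 2]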
Example #7
    def getStrokes(filename):
        tree = ET.parse(filename)
        root = tree.getroot()
        sequence = []
        xnarray = []
        ynarray = []
        tnarray = []
        xarray = []
        yarray = []
        tarray = []
        parray = []
        pointarray = [0]
        op = 0

        for stroke in root[1].findall('Stroke'):

            points = 0

            for point in stroke.findall('Point'):
                xarray.append(float(point.attrib['x']))
                yarray.append(float(point.attrib['y']))
                tarray.append(float(point.attrib['time']))
                parray.append(1)
                points = points + 1
            op = op + points
            pointarray.append(op)
            # Zero the pen flag at the first and last point of each stroke
            parray[-1] = 0
            parray[len(parray) - points] = 0

        # Seed the offset arrays with the first absolute point
        xnarray.append(xarray[0])
        ynarray.append(yarray[0])
        tnarray.append(tarray[0])

        # Convert absolute coordinates into per-point offsets, stroke by stroke
        for i, j in zip(pointarray[:], pointarray[1:]):
            if i != 0:
                xnarray.append(xarray[i] - xarray[i - 1])
                ynarray.append(yarray[i] - yarray[i - 1])
                tnarray.append(tarray[i] - tarray[i - 1])

            for point in range(i + 1, j):
                xnarray.append(xarray[point] - xarray[i])
                ynarray.append(yarray[point] - yarray[i])
                tnarray.append(tarray[point] - tarray[i])

        # Materialize the (dt, dx, dy, pen) tuples; a bare zip object is a
        # one-shot iterator on Python 3
        result = list(zip(tnarray, xnarray, ynarray, parray))
        sequence.append(result)
        return sequence
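
A minimal sketch calling getStrokes, assuming it is reachable at module level and that root[1] is the stroke set (as the code above indexes it); the XML below is hypothetical:

    xml = ('<Session><Header/>'
           '<StrokeSet>'
           '<Stroke><Point x="1" y="2" time="0.0"/>'
           '<Point x="3" y="5" time="0.1"/></Stroke>'
           '</StrokeSet></Session>')
    with open('strokes.xml', 'w') as fw:
        fw.write(xml)

    print(getStrokes('strokes.xml')[0])  # [(0.0, 1.0, 2.0, 0), (0.1, 2.0, 3.0, 0)]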
Example #8
        def getStrokes(filename):
            tree = ET.parse(filename)
            root = tree.getroot()
            sequence = []
            xnarray = []
            ynarray = []
            tnarray = []
            xarray = []
            yarray = []
            tarray = []
            parray = []
            pointarray = [0]
            op = 0

            for stroke in root[1].findall('Stroke'):

                points = 0

                for point in stroke.findall('Point'):
                    xarray.append(float(point.attrib['x']))
                    yarray.append(float(point.attrib['y']))
                    tarray.append(float(point.attrib['time']))
                    parray.append(1)
                    points = points + 1

                op = op + points
                pointarray.append(op)
                parray[-1] = 0
                parray[len(parray) - points] = 0

            xnarray.append(xarray[0])
            ynarray.append(yarray[0])
            tnarray.append(tarray[0])

            for i, j in zip(pointarray[:], pointarray[1:]):
                if i != 0:
                    xnarray.append(abs(xarray[i] - xarray[i - 1]))
                    ynarray.append(abs(yarray[i] - yarray[i - 1]))
                    tnarray.append(abs(tarray[i] - tarray[i - 1]))

                for point in range(i + 1, j):
                    xnarray.append(abs(xarray[point] - xarray[i]))
                    ynarray.append(abs(yarray[point] - yarray[i]))
                    tnarray.append(abs(tarray[point] - tarray[i]))

            # Zero the seed entries so every value is a relative offset
            xnarray[0] = 0
            ynarray[0] = 0
            tnarray[0] = 0
            result = list(zip(tnarray, xnarray, ynarray, parray))
            sequence.append(result)
            return sequence
Example #9
    def process_batch_preana(batch: Iterable[Tuple[int, Paragraph]]) -> Generator[List[Sample], None, None]:
        for document_id, paragraph in batch:
            for sentence in paragraph:
                sequence = []
                for token in sentence:
                    sample = Sample()
                    sequence.append(sample)
                    sample.features['token'] = token.form
                    sample.features['tags'] = uniq([form.tags for form in token.interpretations])
                    sample.features['maca_lemmas'] = uniq([(form.lemma, form.tags) for form in token.interpretations])
                    sample.features['space_before'] = ['space_before'] if token.space_before else ['no_space_before']
                    sample.features['space_before'].append(token.space_before)
                    sample.features['document_id'] = document_id
                Preprocess.create_features(sequence)

                if sequence:
                    yield sequence
Example #10
def preprocess_paragraph_reanalyzed(
        paragraph: Paragraph) -> List[Tuple[List[Sample], List[Sample]]]:
    paragraph_sequence = []
    for sentence, sentence_gold in zip(paragraph, paragraph.concraft):
        # Usable as training data only if the token counts match and every
        # token has a gold form assigned
        valid_training_data = (
            len(sentence_gold.tokens) == len(sentence.tokens)
            and all(token.gold_form is not None for token in sentence.tokens))

        sequence = []
        for token in sentence.tokens:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = token.form
            sample.features['tags'] = uniq(
                map(lambda form: form.tags, token.interpretations))
            if valid_training_data:
                sample.features['label'] = token.gold_form.tags
                sample.features['lemma'] = token.gold_form.lemma
            sample.features['space_before'] = [
                'space_before'
            ] if is_separator_before(
                token.space_before) else ['no_space_before']
            sample.features['tags4e3'] = create_token_features(
                sample.features['token'], sample.features['tags'],
                sample.features['space_before'])

        sequence_gold = []
        for token_gold in sentence_gold.tokens:
            sample = Sample()
            sequence_gold.append(sample)
            sample.features['token'] = token_gold.form
            if token_gold.gold_form is None:
                sample.features['label'] = 'ign'
            else:
                sample.features['label'] = token_gold.gold_form.tags
                sample.features['lemma'] = token_gold.gold_form.lemma
            sample.features['space_before'] = [
                'space_before'
            ] if is_separator_before(
                token_gold.space_before) else ['no_space_before']

        paragraph_sequence.append((sequence, sequence_gold))
    return paragraph_sequence
Example #11
def preprocess_paragraph_preanalyzed(
        paragraph: Paragraph) -> List[Tuple[List[Sample], List[Sample]]]:
    paragraph_sequence = []
    for sentence in paragraph:
        sequence = []
        for token in sentence.tokens:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = token.form
            sample.features['tags'] = uniq(
                map(lambda form: form.tags, token.interpretations))
            sample.features['label'] = token.gold_form.tags
            sample.features['lemma'] = token.gold_form.lemma
            sample.features['space_before'] = [
                'space_before'
            ] if is_separator_before(
                token.space_before) else ['no_space_before']
            sample.features['tags4e3'] = create_token_features(
                sample.features['token'], sample.features['tags'],
                sample.features['space_before'])

        paragraph_sequence.append((sequence, sequence))  # pre-analyzed input serves as its own gold sequence
    return paragraph_sequence
Example #12
def padding(sequence, seq_length):
    """Pad sequence with "<PAD>" tokens up to seq_length, or truncate it down."""
    while len(sequence) < seq_length:
        sequence.append("<PAD>")
    if len(sequence) > seq_length:
        sequence = sequence[:seq_length]
    return sequence
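
A quick usage sketch of the padding helper above, with hypothetical inputs:

    print(padding(['hello', 'world'], 5))  # ['hello', 'world', '<PAD>', '<PAD>', '<PAD>']
    print(padding(list('abcdef'), 3))      # ['a', 'b', 'c']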
Example #13
def padding(sequence):
    """Pad sequence with "<PAD>" tokens up to length 10, or truncate it down."""
    while len(sequence) < 10:
        sequence.append("<PAD>")
    if len(sequence) > 10:
        sequence = sequence[:10]
    return sequence