Example #1
from convert import read_lines


def read_pos(pos_tag_file):
    # Read a "token<TAB>TAG" file and return one list of POS tags per
    # sentence; sentences end at !, . or ? punctuation lines.
    sentences = []
    sentence = [None]  # index-0 placeholder (presumably so tags are 1-indexed)
    for line in read_lines(pos_tag_file)[:-1]:
        if line not in ('!\tPUNCT', '.\tPUNCT', '?\tPUNCT'):
            sentence.append(line.split('\t')[1])
        else:
            # Close the sentence only if it holds more than the placeholder.
            if len(sentence) != 1:
                sentence.append(line.split('\t')[1])
                sentences.append(sentence)
            sentence = [None]
    return sentences
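Every example on this page imports read_lines and write_lines from a local convert module that is never shown. A minimal sketch of what those helpers presumably do, judging only from how they are used (an assumption, not the original module):

def read_lines(filename):
    # Assumed behavior: return the file's lines without trailing newlines.
    with open(filename, encoding='utf-8') as f:
        return f.read().splitlines()


def write_lines(filename, lines):
    # Assumed behavior: write each string as a newline-terminated line.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')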
Example #2
import argparse

from convert import read_lines, write_lines

# add_bio_prefixes, flatten_sentences, STANFORD_FILE and POS_FILE are
# defined earlier in the original file and not shown in this excerpt.


def write_sentences(sentences, filename, preamble):
    # Write the preamble lines followed by the flattened sentence lines.
    write_lines(filename, preamble + flatten_sentences(sentences))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument('--train', type=float, default=0.7)
    parser.add_argument('--test', type=float, default=0.2)
    parser.add_argument("--cv", action="store_true")

    args = parser.parse_args()

    # NER-annotated corpus: one "token ENTITY" pair per line, run through
    # add_bio_prefixes to turn the raw labels into BIO-tagged ones
    tokens_with_entities = add_bio_prefixes(
        [line.split(' ') for line in read_lines(STANFORD_FILE)])
    # Stanford output: tokens with their POS tags and phrase types
    tokens_with_pos_and_phrase_types = [
        line.split(' ') for line in read_lines(POS_FILE)
    ]

    # Merge the two files, keeping the first two lines as a preamble
    preamble = [
        ' '.join(item) for item in tokens_with_pos_and_phrase_types[:2]
    ]
    sentences = []
    sentence = []
    i = 2  # cursor into tokens_with_pos_and_phrase_types, past the preamble
    mismatching_tokens_counter = 0
    max_number_of_mismatching_tokens = 100
    skip_iterations = 0
    for j in range(len(tokens_with_entities)):
        # (the merge loop body is truncated in the original excerpt)
        pass
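write_sentences above relies on a flatten_sentences helper that the excerpt never defines. A plausible minimal version, assuming the usual CoNLL convention of a blank line between sentences (the separator is a guess):

def flatten_sentences(sentences):
    # Concatenate the per-sentence line lists into one flat list of
    # lines, with a blank line between sentences (assumed convention).
    lines = []
    for sentence in sentences:
        lines.extend(sentence)
        lines.append('')
    return lines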
Example #3
from convert import read_lines, write_lines

STANFORD_FILE = 'stanford-ner-corpus.txt'

# Extract the token column of the NER corpus and write all tokens as a
# single space-separated line of plain text.
tokens = [line.split(' ')[0] for line in read_lines(STANFORD_FILE)]

write_lines('text.txt', [
    ' '.join(tokens),
])
Example #4
from convert import read_lines, write_lines


def read_pos(pos_tag_file):
    # Same sentence-splitting helper as in Example #1.
    sentences = []
    sentence = [None]
    for line in read_lines(pos_tag_file)[:-1]:
        if line not in ('!\tPUNCT', '.\tPUNCT', '?\tPUNCT'):
            sentence.append(line.split('\t')[1])
        else:
            if len(sentence) != 1:
                sentence.append(line.split('\t')[1])
                sentences.append(sentence)
            sentence = [None]
    return sentences


STANFORD_DEPS_FILE = 'text.txt.out'
POS_FILE = 'pos.tag'
PHRASE_TYPES_FILE = 'phrase_types.txt'

if __name__ == '__main__':
    # Group the Stanford dependency output into per-sentence blocks,
    # delimited by the "Sentence #N ..." header lines.
    sentences = []
    sentence = []
    for line in read_lines(STANFORD_DEPS_FILE):
        if not line.startswith('Sentence'):
            sentence.append(line)
            # sentence.append(decode_dependency(line))
        else:
            # Drop the first three lines and the last line of the block.
            sentence = sentence[3:-1]
            # Skip blocks too short to contain any dependencies.
            if len(sentence) <= 4:
                sentence = []
                continue
            # (the rest of the processing is truncated in the original excerpt)
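The commented-out decode_dependency call above refers to a helper this excerpt never shows. Stanford's typed-dependency lines look like nsubj(ran-2, dog-1), so a plausible sketch is the following (the name comes from the excerpt, but the parsing logic and return shape are assumptions):

import re


def decode_dependency(line):
    # Parse e.g. "nsubj(ran-2, dog-1)" into
    # (relation, head, head_index, dependent, dependent_index).
    match = re.match(r'(\S+)\((.+)-(\d+), (.+)-(\d+)\)$', line)
    if match is None:
        return None
    relation, head, head_i, dep, dep_i = match.groups()
    return relation, head, int(head_i), dep, int(dep_i)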
Example #5
from convert import read_lines, write_lines

# The module filename contains a hyphen, hence __import__ instead of import.
structure_stanford_deps = __import__('structure-stanford-deps')

STANFORD_POS_FILE = 'ru-conll2003.txt'
STANFORD_FILE = 'stanford-ner-corpus.txt'
POS_FILE = 'phrase_types.txt'

tokens_with_entities = [line.split(' ') for line in read_lines(STANFORD_FILE)]
tokens_with_pos_and_phrase_types = [
    line.split(' ') for line in read_lines(POS_FILE)
]

# Add BIO prefixes to the entity labels
no_entity_mark = 'O'
previous_entity = ''
new_tokens_with_entities = []
for token, entity in tokens_with_entities:
    prefix = (structure_stanford_deps.get_bio_prefix(previous_entity, entity)
              if entity != no_entity_mark else '')
    new_tokens_with_entities.append((token, f'{prefix}{entity}'))
    previous_entity = entity
tokens_with_entities = new_tokens_with_entities

# Merge the entity labels with the POS/phrase-type lines
lines = [' '.join(item) for item in tokens_with_pos_and_phrase_types[:2]]
i = 2
for token, entity in tokens_with_entities:
    if i >= len(tokens_with_pos_and_phrase_types):
        break
    # (the rest of the merge loop is truncated in the original excerpt)
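Both this script and Example #2 depend on get_bio_prefix from structure-stanford-deps; the loop above is also, in all likelihood, what Example #2's add_bio_prefixes wraps. Under the standard BIO scheme the helper would be roughly (a sketch, not the original code):

def get_bio_prefix(previous_entity, entity):
    # Assumed standard BIO convention: a token opens a new entity span
    # ('B-') when its label differs from the previous token's, and
    # continues the current span ('I-') when the label repeats.
    return 'I-' if entity == previous_entity else 'B-'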
Example #6
from convert import read_lines

POS_FILE = 'stanford-ner-corpus.txt'
PHRASE_TYPES_FILE = 'phrase_types.txt'

# Compare the token columns of the two files and report the positions
# where they disagree, stopping after roughly a thousand mismatches.
pos_tokens = [line.split(' ')[0] for line in read_lines(POS_FILE)]
phrase_types = [line.split(' ')[0] for line in read_lines(PHRASE_TYPES_FILE)]

counter = 1000
for i in range(len(pos_tokens)):
    if pos_tokens[i] != phrase_types[i]:
        counter -= 1
        print(i, pos_tokens[i], phrase_types[i])
    if counter < 0:
        break