def prepareSentence2(sentence):
    """Parse *sentence* and return its tokens as Word objects.

    Runs the project's parser once, derives lemmas and POS tags from the
    parse, and merges them row-by-row: each merged row is the lemma row's
    fields followed by the POS row's 4th field. Word is built from the
    merged row: index 1 (minus one) and 2 as constructor args, index 3 as
    lemma, index 4 as POS tag.
    """
    parsed = parse_text(sentence)
    lemma_rows = lemmatize(parsed)
    pos_rows = posTag(parsed)

    words = []
    # Pair each lemma row with its POS row; the merged row reproduces the
    # original's append-then-extend layout exactly.
    for lemma_row, pos_row in zip(lemma_rows, pos_rows):
        merged = list(lemma_row) + [pos_row[3]]
        token = Word(merged[1] - 1, merged[2])
        token.lemma = merged[3]
        token.pos = merged[4]
        words.append(token)
    return words
def _get_words(raw_sentence):
    """Convert a parsed sentence dict into a list of Word objects.

    Expects raw_sentence['words'] to be a sequence of (surface, attrs)
    pairs where attrs carries 'Lemma', 'PartOfSpeech', and
    'NamedEntityTag' keys (CoreNLP-style output). Word indices are
    1-based; POS tags are lower-cased.
    """
    words = []
    for index, (surface, attrs) in enumerate(raw_sentence['words'], start=1):
        token = Word(index, surface)
        token.lemma = attrs['Lemma']
        token.pos = attrs['PartOfSpeech'].lower()
        token.ner = attrs['NamedEntityTag']
        words.append(token)
    return words
def load(path_input):
    """Load CoNLL-style dependency-parsed sentences from *path_input*.

    Each non-blank line is one token as tab-separated fields
    (id, form, lemma, _, pos, _, head, dep, ...); blank lines separate
    sentences. Returns a list of sentences, each a list of Word objects.

    Fixes over the previous version: a trailing blank line (or an empty
    file) no longer produces a spurious empty sentence; consecutive blank
    lines no longer emit empty sentences; the blank-line test tolerates
    '\r\n' and whitespace-only separator lines.
    """
    sentences = []
    sentence = []
    with codecs.open(path_input, 'r', 'utf8') as f:
        # Iterate the file lazily instead of materializing readlines().
        for line in f:
            if not line.strip():
                # Sentence boundary; guard against consecutive blank lines.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            parts = line.strip().split('\t')
            word = Word(parts[0], parts[1])  # punctuation head is root
            word.lemma = parts[2]
            word.pos = parts[4]
            word.dep = parts[7]
            word.head = parts[6]
            sentence.append(word)
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
    return sentences