import os
import subprocess

import xmltodict

# Project-local helpers assumed importable elsewhere in this repo:
# FileUtils, WordFilterFactory, TAG_TO_PART_OF_SPEECH,
# convert_part_of_speech, utils.fill_suffix.


def analyze(filename, known_words_filepath, not_known_words_filepath,
            print_example):
    known_words = set(FileUtils.read(known_words_filepath).split())
    not_known_words = set(FileUtils.read(not_known_words_filepath).split())

    # CoreNLP names its output '<input file>.xml' inside -outputDirectory;
    # FileUtils.random_path() is assumed to return a path under /tmp/, so
    # the two paths line up.
    tmp_filepath = FileUtils.random_path()
    output_filepath = tmp_filepath + '.xml'
    FileUtils.copy(filename, tmp_filepath)

    corncob_word_list = os.path.join('data', 'corncob_lowercase.txt')
    word_filter = WordFilterFactory.create_word_filter(corncob_word_list)

    cmd = [
        'java',
        '-cp',
        ':'.join([
            'stanford-corenlp-full/stanford-corenlp-3.3.1.jar',
            'stanford-corenlp-full/stanford-corenlp-3.3.1-models.jar',
            'stanford-corenlp-full/xom.jar',
            'stanford-corenlp-full/joda-time.jar',
            'stanford-corenlp-full/jollyday.jar',
            'stanford-corenlp-full/ejml-0.23.jar',
        ]),
        '-Xmx2g',
        'edu.stanford.nlp.pipeline.StanfordCoreNLP',
        '-annotators', 'tokenize,ssplit,pos,lemma',
        '-file', tmp_filepath,
        '-outputDirectory', '/tmp/',
    ]
    subprocess.call(cmd)

    raw_output = FileUtils.read(output_filepath)
    d = xmltodict.parse(raw_output)
    sentences = d['root']['document']['sentences']['sentence']

    candidate_words = {}

    def word_filter_fun(word, lemma, tag):
        del word, tag  # unused
        return word_filter.isok(lemma)

    def adjective_filter_fun(word, lemma, tag):
        del word, lemma  # unused
        # Skip comparative (JJR) and superlative (JJS) adjective forms.
        return tag not in ('JJR', 'JJS')

    filters = [word_filter_fun, adjective_filter_fun]

    for sentence_dict in sentences:
        tokens = sentence_dict['tokens']['token']
        # xmltodict parses a single-token sentence as a dict rather than a
        # list; skip those instead of special-casing them.
        if not isinstance(tokens, list):
            continue

        # Reconstruct the raw sentence from token character offsets,
        # preserving the original spacing between tokens.
        last_offset = int(tokens[0]['CharacterOffsetBegin'])
        sentence_raw = ''
        for token in tokens:
            word = token['word']
            begin_offset = int(token['CharacterOffsetBegin'])
            sentence_raw += (begin_offset - last_offset) * ' '
            sentence_raw += word
            last_offset = int(token['CharacterOffsetEnd'])

        for token in tokens:
            word = token['word']
            lemma = token['lemma']
            tag = token['POS']
            if tag not in TAG_TO_PART_OF_SPEECH:
                continue
            if all(f(word, lemma, tag) for f in filters):
                candidate_words[(lemma, TAG_TO_PART_OF_SPEECH[tag])] = {
                    'example_sentence': sentence_raw,
                    'word': word,
                }

    # Keep only lemmas the user has not already classified either way.
    not_known = []
    for ((lemma, part_of_speech), d) in candidate_words.iteritems():
        if lemma not in known_words and lemma not in not_known_words:
            not_known.append((lemma, part_of_speech, d))

    for (lemma, part_of_speech, d) in not_known:
        word = d['word']
        example_sentence = d['example_sentence']
        out = '(%s.) %s' % (convert_part_of_speech(part_of_speech), lemma)
        if print_example:
            # Pad to 22 columns; ' # ' then puts the sentence at column 25,
            # which the caret line below reuses to underline the word.
            line = utils.fill_suffix(out, 22, ' ') + ' # ' + example_sentence
            # str.find avoids the regex-metacharacter bug that re.search
            # had when the word contained characters like '(' or '.'.
            match_pos = example_sentence.find(word)
            print line.encode('utf-8')
            print ((match_pos + 25) * ' ') + (len(word) * '^')
        else:
            print out.encode('utf-8')
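# Minimal usage sketch. The input and word-list paths below are
# hypothetical stand-ins for whatever CLI or config normally drives
# analyze(); it assumes the word-list files are plain whitespace-separated
# text and the CoreNLP 3.3.1 jars are unpacked under
# 'stanford-corenlp-full/'.
if __name__ == '__main__':
    analyze(
        filename='article.txt',
        known_words_filepath='data/known_words.txt',
        not_known_words_filepath='data/not_known_words.txt',
        print_example=True,
    )
    # With print_example=True, each unknown lemma prints alongside the
    # sentence it was found in, with a caret line underlining the
    # inflected form, e.g.:
    #
    #   (adj.) obstreperous    # The obstreperous crowd refused to leave.
    #                              ^^^^^^^^^^^^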