def pos_tag_file(self, input_path, output_path=None):
    """POS tag the contents of a file.

    The text at ``input_path`` is read and run through
    ``self.pos_tag_raw_text``. When a tagged file is written, the
    newline between sentences is purely for readability — sentence
    boundaries are recovered later with ``sent_tokenize``.

    :param input_path: path of the source file to tag
    :param output_path: optional destination; when set, the tagged text
        (separator-joined string form) is written there and nothing is
        returned
    :return: list of sentences, each a list of (word, tag) tuples, when
        ``output_path`` is None; otherwise None
    """
    text = read_file(input_path)
    if output_path is None:
        # No destination requested: hand back the structured form.
        return self.pos_tag_raw_text(text, as_tuple_list=True)
    # Destination requested: produce the flat string form and persist it.
    write_string(self.pos_tag_raw_text(text, as_tuple_list=False), output_path)
if __name__ == '__main__':
    # Command-line entry point: extract the top-N keyphrases from raw text
    # supplied either inline (-raw_text) or from a file (-text_file).
    parser = argparse.ArgumentParser(
        description='Extract keyphrases from raw text')

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-raw_text', help='raw text to process')
    group.add_argument('-text_file',
                       help='file containing the raw text to process')

    parser.add_argument('-tagger_host', help='CoreNLP host',
                        default='services.loadbalancer.api.questo.ai')
    # type=int keeps a CLI-supplied port consistent with the int default;
    # without it argparse passes the value through as a string.
    parser.add_argument('-tagger_port', help='CoreNLP port',
                        type=int, default=9000)
    parser.add_argument('-N', help='number of keyphrases to extract',
                        required=True, type=int)

    args = parser.parse_args()

    if args.text_file:
        raw_text = read_file(args.text_file)
    else:
        raw_text = args.raw_text

    embedding_distributor = load_local_embedding_distributor()
    pos_tagger = load_local_corenlp_pos_tagger(args.tagger_host,
                                               args.tagger_port)
    print(extract_keyphrases(embedding_distributor, pos_tagger, raw_text,
                             args.N, 'en'))
tagged_text = list(raw_tag_text()) if as_tuple_list: return tagged_text return '[ENDSENT]'.join( [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text]) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Write POS tagged files, the resulting file will be written' ' at the same location with _POS append at the end of the filename') parser.add_argument('tagger', help='which pos tagger to use [stanford, spacy, corenlp]') parser.add_argument('listing_file_path', help='path to a text file ' 'containing in each row a path to a file to POS tag') args = parser.parse_args() if args.tagger == 'stanford': pt = PosTaggingStanford() suffix = 'STANFORD' elif args.tagger == 'spacy': pt = PosTaggingSpacy() suffix = 'SPACY' elif args.tagger == 'corenlp': pt = PosTaggingCoreNLP() suffix = 'CoreNLP' list_of_path = read_file(args.listing_file_path).splitlines() print('POS Tagging and writing ', len(list_of_path), 'files') pt.pos_tag_and_write_corpora(list_of_path, suffix)