def pos_tag_file(self, input_path, output_path=None):

        """
        POS tag the contents of a file.

        Produces either a list of sentences (each a list of (word, tag)
        tuples) or, when a destination path is given, a POS-tagged text file.

        Note: the newline between sentences in the written file is purely for
        readability; when a tagged file is read back, sent_tokenize is used
        again to recover sentence boundaries.

        :param input_path: path of the source file to tag
        :param output_path: if given, write the tagged text (separator form,
                            self.pos_tag_raw_text with as_tuple_list=False)
                            to this path and return nothing; if omitted,
                            return the tuple-list form
                            (self.pos_tag_raw_text with as_tuple_list=True)

        :return: list of list of (word, tag) tuples, or None when
                 output_path is set.
        """

        text = read_file(input_path)

        if output_path is None:
            # No destination requested: hand the tagged sentences back.
            return self.pos_tag_raw_text(text, as_tuple_list=True)

        # Serialize with the configured separator and persist to disk.
        write_string(self.pos_tag_raw_text(text, as_tuple_list=False),
                     output_path)
# --- Example #2 (0) ---
if __name__ == '__main__':
    # CLI entry point: extract the top-N keyphrases from raw text using a
    # CoreNLP POS tagger and a local sentence-embedding distributor.
    parser = argparse.ArgumentParser(
        description='Extract keyphrases from raw text')

    # Exactly one input source must be supplied: inline text or a file path.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-raw_text', help='raw text to process')
    group.add_argument('-text_file',
                       help='file containing the raw text to process')

    parser.add_argument('-tagger_host',
                        help='CoreNLP host',
                        default='services.loadbalancer.api.questo.ai')
    # BUG FIX: without type=int, a port given on the command line arrives as
    # a str while the default is the int 9000 — normalize both to int.
    parser.add_argument('-tagger_port', help='CoreNLP port', default=9000,
                        type=int)
    parser.add_argument('-N',
                        help='number of keyphrases to extract',
                        required=True,
                        type=int)
    args = parser.parse_args()

    if args.text_file:
        raw_text = read_file(args.text_file)
    else:
        raw_text = args.raw_text

    embedding_distributor = load_local_embedding_distributor()
    pos_tagger = load_local_corenlp_pos_tagger(args.tagger_host,
                                               args.tagger_port)
    print(
        extract_keyphrases(embedding_distributor, pos_tagger, raw_text, args.N,
                           'en'))
        tagged_text = list(raw_tag_text())

        if as_tuple_list:
            return tagged_text
        return '[ENDSENT]'.join(
            [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text])


if __name__ == '__main__':
    # CLI entry point: POS tag every file listed in a manifest, writing each
    # result next to its source with a tagger-specific suffix appended.
    parser = argparse.ArgumentParser(description='Write POS tagged files, the resulting file will be written'
                                                 ' at the same location with _POS append at the end of the filename')

    parser.add_argument('tagger', help='which pos tagger to use [stanford, spacy, corenlp]')
    parser.add_argument('listing_file_path', help='path to a text file '
                                                  'containing in each row a path to a file to POS tag')
    args = parser.parse_args()

    # Select the tagger implementation; the suffix labels the output files.
    if args.tagger == 'stanford':
        pt = PosTaggingStanford()
        suffix = 'STANFORD'
    elif args.tagger == 'spacy':
        pt = PosTaggingSpacy()
        suffix = 'SPACY'
    elif args.tagger == 'corenlp':
        pt = PosTaggingCoreNLP()
        suffix = 'CoreNLP'
    else:
        # BUG FIX: an unrecognized tagger name previously fell through the
        # chain and crashed later with NameError on `pt`; fail fast with a
        # clear usage message instead.
        parser.error("unknown tagger '%s', expected one of: stanford, spacy, corenlp" % args.tagger)

    list_of_path = read_file(args.listing_file_path).splitlines()
    print('POS Tagging and writing ', len(list_of_path), 'files')
    pt.pos_tag_and_write_corpora(list_of_path, suffix)