Example #1
    def __init__(self, core_nlp_version: str = '4.1.0'):
        self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if len([d for d in self.install_dir.glob('*') if d.is_dir()]) == 0:
            # No coreNLP directories. Let's check for ZIP archives as well.
            zip_files = [
                d for d in self.install_dir.glob('*') if d.suffix == '.zip'
            ]
            if len(zip_files) == 0:
                # No dir and no ZIP. Let's download it with the desired core_nlp_version.
                remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-{}.zip'.format(
                    core_nlp_version)
                print('Downloading from %s.' % remote_url)
                output_filename = wget.download(remote_url,
                                                out=str(self.install_dir))
                print('\nExtracting to %s.' % self.install_dir)
            else:
                output_filename = zip_files[0]
            print('Unzip %s.' % output_filename)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()
        target_dir = [d for d in self.install_dir.glob('*') if d.is_dir()][0]

        os.environ['CORENLP_HOME'] = str(target_dir)  # glob already returns the full path
        from stanfordnlp.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=['openie'], memory='8G')
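A minimal usage sketch for the wrapper above. The class name `OpenIEWrapper` and the example sentence are placeholders, and reading triples via the `openieTriple` protobuf field is the usual pattern for openie output rather than something shown in this snippet:

# Hypothetical usage; OpenIEWrapper stands in for the (unnamed) class the
# __init__ above belongs to.
wrapper = OpenIEWrapper(core_nlp_version='4.1.0')
ann = wrapper.client.annotate('Barack Obama was born in Hawaii.')
for sentence in ann.sentence:
    for triple in sentence.openieTriple:  # field name assumed from the CoreNLP protobuf
        print(triple.subject, triple.relation, triple.object)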
class Tokenizer:
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
            os.environ['HOME'])
        self.client = CoreNLPClient(annotators=['ssplit'])
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer \
            = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer

    def tokenize(self, doc: str) -> List[List[Token]]:
        corenlp_annotation = self.client.annotate(doc)
        sentences = []
        for sentence in corenlp_annotation.sentence:
            text = doc[sentence.characterOffsetBegin:sentence.characterOffsetEnd]
            if self.do_lower_case:
                text = text.lower()
            offset = sentence.characterOffsetBegin
            bert_tokens = self.basic_tokenizer.tokenize(text)
            begin = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                begin = text.index(word, begin)
                end = begin + len(word)
                tokens.append(Token(word, begin + offset, end + offset))
                begin = end
            if len(tokens) > 0:
                sentences.append(tokens)
        return sentences
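`Token` is not defined in this snippet; since the tokenizer only ever builds it from a word plus begin/end character offsets, a minimal stand-in (an assumption, not the project's actual class) could be:

from typing import NamedTuple

class Token(NamedTuple):
    word: str
    begin: int  # character offset into the original document
    end: int

# e.g. Token('Stanford', 42, 50)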
Example #3
def get_corenlp_client(corenlp_path, corenlp_port):
    from stanfordnlp.server import CoreNLPClient

    annotators = ["tokenize", "ssplit"]

    os.environ["CORENLP_HOME"] = corenlp_path
    if is_port_occupied(port=corenlp_port):
        # A server is already listening on this port, so attach to it instead
        # of starting a new one.
        corenlp_client = CoreNLPClient(annotators=annotators,
                                       timeout=99999,
                                       memory='4G',
                                       endpoint="http://localhost:%d" % corenlp_port,
                                       start_server=False,
                                       be_quiet=False)
        return corenlp_client
    else:
        print("Starting CoreNLP client at port {}".format(corenlp_port))
        corenlp_client = CoreNLPClient(annotators=annotators,
                                       timeout=99999,
                                       memory='4G',
                                       endpoint="http://localhost:%d" % corenlp_port,
                                       start_server=True,
                                       be_quiet=False)
        return corenlp_client
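`is_port_occupied` is referenced but not shown. One plausible implementation (an assumption; the original helper may differ) simply attempts a TCP connection:

import socket

def is_port_occupied(ip="127.0.0.1", port=9000):
    """Return True if something is already listening on (ip, port)."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1.0)
        return sock.connect_ex((ip, port)) == 0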
Example #4
    def __init__(self,
                 core_nlp_version: str = '2018-10-05',
                 threads: int = 5,
                 close_after_finish: bool = True):
        self.remote_url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(
            core_nlp_version)
        self.install_dir = Path(os.environ['STANFORD_HOME']).expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir / Path(
                'stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading to %s.' % self.install_dir)
            output_filename = wget.download(self.remote_url,
                                            out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()

        os.environ['CORENLP_HOME'] = str(
            self.install_dir / 'stanford-corenlp-full-{}'.format(core_nlp_version))
        from stanfordnlp.server import CoreNLPClient
        self.close_after_finish = close_after_finish
        self.client = CoreNLPClient(annotators=['openie'],
                                    memory='8G',
                                    threads=threads)
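A usage sketch for the wrapper above; `StanfordOpenIE` is a stand-in for the enclosing class name, and `client.stop()` is how other snippets on this page shut the background server down:

# Hypothetical usage; requires STANFORD_HOME to be set, as in __init__ above.
extractor = StanfordOpenIE(threads=4, close_after_finish=True)
ann = extractor.client.annotate('The cat sat on the mat.')
print(len(ann.sentence))
if extractor.close_after_finish:
    extractor.client.stop()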
Example #5
    def make_corenlp_client(self,
                            annotators=["tokenize", "ssplit"],
                            endpoint="http://localhost:9000",
                            properties_name="french",
                            properties_dict=None,
                            quiet=True):
        LEGACY_PROPERTIES = {}
        FRENCH_PROPERTIES = {
            "tokenize.language": "French",
            "tokenize.options": "ptb3Dashes=true"
        }
        PROPERTIES = {"legacy": LEGACY_PROPERTIES, "french": FRENCH_PROPERTIES}
        if properties_dict is not None:
            properties = properties_dict
        else:
            if properties_name in PROPERTIES.keys():
                properties = PROPERTIES[properties_name]
            else:
                raise ValueError("Unknow properties '%s'" % properties_name)

        devnull = open(os.devnull, "w")
        stdout = devnull if quiet else sys.stdout
        stderr = devnull if quiet else sys.stderr
        self.corenlp_client = \
            CoreNLPClient(annotators=annotators,
                          endpoint=endpoint,
                          stdout=stdout,
                          stderr=stderr,
                          memory="8G",
                          heapsize="8G",
                          threads=8,
                          timeout=15000,
                          properties=properties
                          )
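Once `make_corenlp_client` has been called, the stored client behaves like any other CoreNLPClient. A sketch, assuming the enclosing class is instantiated as `wrapper` (a hypothetical name):

# Hypothetical usage; `wrapper` is an instance of the (unnamed) enclosing class.
wrapper.make_corenlp_client(properties_name="french", quiet=True)
ann = wrapper.corenlp_client.annotate("Bonjour tout le monde.")
for sentence in ann.sentence:
    print([token.word for token in sentence.token])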
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
            os.environ['HOME'])
        self.client = CoreNLPClient()
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer \
            = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer
Example #7
    def __init__(self, tagger='spacy', language='french'):
        self.tagger = tagger
        self.tagmodule = None
        self.tagset = UTagSet  # TAG Set by default
        self.language = language
        spacy_module = {
            'french': 'fr_core_news_sm',
            'english': 'en_core_web_sm'
        }

        if tagger == 'spacy':
            self.tagger = self.spacy_pos_tag
            self.tagset = UDTagSet
            try:
                self.tagmodule = spacy.load(spacy_module[language])
            except Exception:
                logger.warning(
                    'Module for language [{:s}] not installed for Spacy - using french by default'
                    .format(language))
                self.tagmodule = spacy.load(spacy_module['french'])
        elif tagger == 'stanford':
            self.tagger = self.stanford_pos_tag
            self.tagset = FTTagSet
            JAVAHOME = "C:/Program Files (x86)/Java/jre1.8.0_241/bin/java.exe"
            # Set a JAVAHOME environment variable if not present
            if 'JAVAHOME' not in os.environ:
                os.environ['JAVAHOME'] = JAVAHOME
            root_path = "./stanford-postagger/"  # location of Stanford POS Tagger components

            # Launch the Stanford Pos Tagger (implemented in Java)
            self.tagmodule = StanfordPOSTagger(
                root_path + "models/" + language + ".tagger",
                root_path + "stanford-postagger.jar",
                encoding='utf8')
        elif tagger == 'core_nlp':
            self.tagger = self.corenlp_pos_tag
            os.environ['CORENLP_HOME'] = './stanford-corenlp-full-2018-10-05'
            try:
                self.tagmodule = CoreNLPClient(properties=language,
                                               annotators=[
                                                   'pos',
                                               ],
                                               timeout=30000,
                                               memory='1G')
            except Exception:
                logger.warning(
                    'Could not launch Stanford Core NLP for [{:s}]'.format(
                        language))
        elif tagger == 'nltk':
            self.tagger = self.nltk_pos_tag
            self.tagset = NLTKTagSet
            if language != 'english':
                logger.warning(
                    'nltk does not support [{:s}] language'.format(language))
        else:
            logger.warning('POS tagger [{:s}] unknown'.format(tagger))
Example #8
    def __init__(self, corenlp_home, endpoint='http://localhost:9000', timeout=15000, memory='2G'):
        print('Set up Stanford CoreNLP Server.')

        if os.path.exists(corenlp_home):
            os.environ['CORENLP_HOME'] = corenlp_home
        else:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), corenlp_home)

        self.client = CoreNLPClient(annotators=['depparse'], endpoint=endpoint, timeout=timeout, memory=memory)
        self.client.annotate('Prepare.')
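Reading the dependency parse out of the resulting annotation might look like the sketch below; it uses the same `basicDependencies` fields (and 1-based edge indices) that Example #29 further down relies on, with `parser` as a hypothetical instance of the wrapper above:

# Hypothetical usage; `parser` is an instance of the wrapper class above.
ann = parser.client.annotate('She gave him the book.')
for sentence in ann.sentence:
    for edge in sentence.basicDependencies.edge:
        governor = sentence.token[edge.source - 1].word   # edge indices are 1-based
        dependent = sentence.token[edge.target - 1].word
        print(edge.dep, governor, dependent)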
Example #10
def serve_stanfordnlp_client():

    return CoreNLPClient(endpoint='http://localhost:9000',
                         timeout=30000,
                         threads=4,
                         annotators='sentiment',
                         memory='8G')
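A sketch of consuming the sentiment output from this client, using the same JSON request style and `sentimentValue` field that Example #24 below reads:

# Hypothetical usage of the client returned above.
client = serve_stanfordnlp_client()
ann = client.annotate('I love this movie. The ending was terrible.',
                      properties={'annotators': 'sentiment',
                                  'outputFormat': 'json'})
for sentence in ann['sentences']:
    print(sentence['sentimentValue'])
client.stop()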
Example #11
def syntactic_parse_texts(texts: List[str], tokenize=False, sentence_split=False, verbose=False):
    if verbose:
        print(f"Parsing {len(texts)} texts...")

    corenlp_annotators = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'depparse']
    annotators_properties = {'tokenize.whitespace': not tokenize,
                             'ssplit.eolonly': not sentence_split,
                             'depparse.model': "edu/stanford/nlp/models/parser/nndep/english_SD.gz",
                             'outputFormat': 'json'}
    if not STANFORD_CORENLP_DIR.exists():
        download_stanford_corenlp()
    os.environ['CORENLP_HOME'] = str(STANFORD_CORENLP_DIR)

    parse_results = []
    with CoreNLPClient(annotators=corenlp_annotators) as client:
        for text in tqdm(texts, disable=(not verbose)):
            if isinstance(text, List):
                text = ' '.join(text)
            raw_parse_result = client.annotate(text, properties=annotators_properties)
            parse_result = format_parser_output(raw_parse_result['sentences'])

            if len(parse_result['sentences']) > 1 and not sentence_split:
                parse_result = join_parse_result(parse_result)
            elif sentence_split:
                parse_result = split_parse_result(parse_result['sentences'])

            parse_results.append(parse_result)

    return parse_results
Example #12
def process_book(header_annot_dir, lemma_dir, tree_dir, book_id):

    if os.path.exists(os.path.join(
            tree_dir, book_id + '.xml')) and os.path.exists(
                os.path.join(lemma_dir, book_id + '.pkl')):
        return book_id, 'Exists'

    os.environ["CORENLP_HOME"] = os.path.expanduser(
        "~/stanford_corenlp/stanford-corenlp-full-2018-10-05")

    try:
        with CoreNLPClient(annotators=['tokenize', 'lemma'],
                           timeout=30000,
                           max_char_length=100000000,
                           be_quiet=True,
                           start_server=False) as client:
            tree, para_end_sentences, lemma_dict = sentencize(
                header_annot_dir, client, book_id)

        tree2 = paragraphize(tree, para_end_sentences)

        filename = os.path.join(tree_dir, book_id + '.xml')
        tree2.write(filename, pretty_print=True)

        with open(os.path.join(lemma_dir, book_id + '.pkl'), 'wb') as f:
            pickle.dump(lemma_dict, f)
    except Exception as e:
        print(book_id, e)
        return book_id, e

    print(book_id, 'Success!')
    return book_id, 'Success'
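Since `process_book` attaches to an already running server (`start_server=False`), a driver has to start one CoreNLP server first and can then fan the books out over a process pool. A hedged sketch with placeholder directory names and book IDs:

# Hypothetical driver; the directory names and BOOK_IDS are placeholders, and a
# CoreNLP server must already be listening on the default endpoint.
from functools import partial
from multiprocessing import Pool

BOOK_IDS = ['book_001', 'book_002']
worker = partial(process_book, 'header_annot/', 'lemmas/', 'trees/')
with Pool(processes=4) as pool:
    for book_id, status in pool.map(worker, BOOK_IDS):
        print(book_id, status)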
Example #13
def tokenize(data, src_keys=['title', 'body'], tgt_key='text'):
    """Use Stanford CoreNLP tokenizer to tokenize all the documents."""
    REMAP = {
        "-LRB-": "(",
        "-RRB-": ")",
        "-LCB-": "{",
        "-RCB-": "}",
        "-LSB-": "[",
        "-RSB-": "]",
        "``": '"',
        "''": '"'
    }
    with CoreNLPClient(annotators=['tokenize', 'ssplit'],
                       threads=CPU_CNT) as client:
        for did, d in tqdm(data.items()):
            text = ''
            for k in src_keys:
                text += d[k] + ' '
            ann = client.annotate(text.strip())
            tokens = []  # list of tokenized sentences
            for sent in ann.sentence:
                tokens.append([
                    REMAP[t.word] if t.word in REMAP else t.word.lower()
                    for t in sent.token
                ])
            d[tgt_key] = tokens
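Calling the tokenizer above on a tiny document dictionary could look like this; `CPU_CNT` is a module-level constant in the original and is only defined here so the sketch stands alone:

# Hypothetical usage; a CoreNLP install must be available via CORENLP_HOME.
CPU_CNT = 4
data = {'doc1': {'title': 'Stanford CoreNLP', 'body': 'It tokenizes text (quickly).'}}
tokenize(data)
print(data['doc1']['text'])  # a list of lower-cased, tokenized sentences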
def process_document(doc, doc_id=None):
    """Main method: Annotate a document using CoreNLP client

    Arguments:
        doc {str} -- raw string of a document
        doc_id {str} -- raw string of a document ID

    Returns:
        sentences_processed {[str]} -- a list of processed sentences with NER tagged
            and MWEs concatenated
        doc_ids {[str]} -- a list of processed sentence IDs [docID1_1, docID1_2...]
        Example:
            Input: "When I was a child in Ohio, I always wanted to go to Stanford University with respect to higher education.
            But I had to go along with my parents."
            Output:

            'when I be a child in [NER:LOCATION]Ohio , I always want to go to [NER:ORGANIZATION]Stanford_University with_respect_to higher education . '
            'but I have to go_along with my parent . '

            doc1_1
            doc1_2
    
    Note:
        When the doc is empty, both doc_id and sentences_processed will be empty as well. (@TODO: fix for consistency)
    """
    with CoreNLPClient(endpoint="http://localhost:9002",
                       start_server=False) as client:
        doc_ann = client.annotate(doc)
    sentences_processed = []
    doc_sent_ids = []
    for i, sentence in enumerate(doc_ann.sentence):
        sentences_processed.append(process_sentence(sentence))
        doc_sent_ids.append(str(doc_id) + "_" + str(i))
    return "\n".join(sentences_processed), "\n".join(doc_sent_ids)
Example #15
    def _preprocess_and_save(self, data, cached_examples_dir):
        index = dict()
        index['guids'] = []
        index['feafile_name'] = []
        index['offset'] = []
        index['label_ids'] = []
        feafile_name = "fea"

        output = dict()

        for key in self.feature_keys:
            output[key] = list()
        with CoreNLPClient(annotators=['natlog'], timeout=60000,
                           memory='16G') as client:
            for ex in tqdm(data):
                # # preprocess
                # if self.drop_unk_samples and ex["gold_label_id"] not in self.id2label_dict.keys():
                #     continue
                this_output = self._data2feature(ex, client)

                if len(this_output) == 0:
                    continue
                for key in self.feature_keys:
                    output[key].append(this_output[key])

        output = self._fea2tensor(output)

        index["guids"] = output["guids"]
        index['feafile_name'] = [feafile_name] * len(index["guids"])
        index['offset'] = list(range(len(index["guids"])))
        index["label_ids"] = output["label_ids"].numpy()

        torch.save(output, os.path.join(cached_examples_dir, feafile_name))
        self._save_index(cached_examples_dir, index)
        return
Example #16
    def __init__(self, tagged_dataset_path, database_path, corenlp_path):
        self.target_values_map = {}
        for filename in os.listdir(tagged_dataset_path):
            filename = os.path.join(tagged_dataset_path, filename)
            print('Reading dataset from', filename, file=sys.stderr)
            with open(filename, 'r', encoding='utf8') as fin:
                header = fin.readline().rstrip('\n').split('\t')
                for line in fin:
                    stuff = dict(zip(header, line.rstrip('\n').split('\t')))
                    ex_id = stuff['id']
                    original_strings = tsv_unescape_list(stuff['targetValue'])
                    canon_strings = tsv_unescape_list(stuff['targetCanon'])
                    self.target_values_map[ex_id] = to_value_list(
                        original_strings, canon_strings)

        os.environ['CORENLP_HOME'] = corenlp_path
        self.client = CoreNLPClient(annotators="ner".split())
        self.db_path = database_path
Example #17
def get_corenlp_client(corenlp_path, corenlp_port):
    os.environ["CORENLP_HOME"] = corenlp_path

    assert not is_port_occupied(
        corenlp_port), "Port {} is occupied by other process".format(
            corenlp_port)
    corenlp_client = CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'depparse'],
        timeout=60000,
        memory='5G',
        endpoint="http://localhost:%d" % corenlp_port,
        start_server=True,
        be_quiet=False)
    corenlp_client.annotate(
        "hello world",
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'depparse'],
        output_format="json")
    return corenlp_client
Example #18
def preprocess_data(args):

    os.environ['CORENLP_HOME'] = args.corenlp_dir
    texts = []
    if "Treebank" in args.parse_type:
        print('Joining the separated EDUs in each *.edus file into a single-line *.text file...')
        texts = [(join_edus(fedu), fname)
                 for fedu, fname in [(os.path.join(args.data_dir, fname), fname)
                                     for fname in os.listdir(args.data_dir)
                                     if fname.endswith('.edus')]]
    elif args.parse_type == "Wiki":
        data = pd.read_excel(os.path.join(args.data_dir, "Wikipedia_afd_persuasive.xlsx"))
        texts = [(text, '') for text in data['rationale'].values]
    file_list = []
    corenlp_list = []
    save_path = os.path.join(args.output_dir, args.parse_type)
    if "Treebank" in args.parse_type:
        save_path = os.path.join(args.output_dir, args.parse_type, args.data_dir.split("/")[2])
    corenlp_pickle = os.path.join(args.output_dir, args.parse_type, "corenlp_data.p")
    if not os.path.exists(corenlp_pickle) or not os.path.getsize(corenlp_pickle):
        with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse'], timeout=30000, memory='16G',
                           output_format='xml') as client:
            for text, fname in texts:
                print(text)
                if text and not pd.isna(text):
                    if args.parse_type == "Wiki":
                        regular = re.compile(r"\[http.*]")
                        stop = ["Keep ", "*<sKeep or </s", "Delete - ", "Weak Keep - ", "<s>Delete</s -", "Keep - ", "Delete ",
                                "Keep - <s>", "Keep, ", "Strong Keep - ", "Keep both - ",
                                "Keep per [[WP:NEXIST]]. ", "Keep per [[WP:SUSTAINED]]. ", "Keep,", "*<sWeak Keep.",
                                "*<sDelete", "Keep<br>", "<sKeep", "Keep&mdash;", "Delete, ",
                                "*<sDelete: ", "delete ", "*<sKeep. ", "**<delDelete. ", "::<sKeep ", "Keep--", "Keep - <s>"]
                        for s in stop:
                            text = text.replace(s, "")
                        re_list = re.findall(regular, text)
                        for r in re_list:
                            text = text.replace(r, "link")

                    ann = client.annotate(text)
                else:
                    ann = ''
                corenlp_list.append((ann, fname))

        with open(os.path.join(save_path, "corenlp_data.p"), 'wb') as file:
            pickle.dump(corenlp_list, file)

    with open(os.path.join(save_path, "corenlp_data.p"), 'rb') as file:
        corenlp_list = pickle.load(file)
        for ann, fname in corenlp_list:
            # print(ann)
            if "Treebank" in args.parse_type:
                lines = merge_treebank(ann, os.path.join(args.data_dir, fname))
            elif args.parse_type == "Wiki":
                if not ann:
                    lines = []
                else:
                    lines = merge(ann)
            file_list.append(lines)

    with open(os.path.join(save_path, "processed_data.p"), 'wb') as file:
        pickle.dump(file_list, file)
class StanfordCoreferenceResolver(CoreferenceResolver):
    def __init__(self,
                 start_server=True,
                 endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
        self.__client = CoreNLPClient(start_server=start_server,
                                      endpoint=endpoint,
                                      annotators=[
                                          'tokenize', 'ssplit', 'pos', 'lemma',
                                          'ner', 'parse', 'coref'
                                      ],
                                      output_format='json')
        self.__client.start()

    def __del__(self):
        self.__client.stop()

    def resolve_coreferences(self, text, entities):
        annotations = self.__client.annotate(text)

        entity_mention_indices = []
        for chain in annotations.corefChain:
            mention_indices = []
            for mention in chain.mention:
                sentence = annotations.sentence[mention.sentenceIndex]
                token_start = sentence.token[mention.beginIndex]
                token_end = sentence.token[mention.endIndex - 1]
                char_start = token_start.beginChar
                char_end = token_end.endChar
                mention_indices.append((char_start, char_end))
            entity_mention_indices.append(mention_indices)

        entity_sets = [list() for _ in range(len(entity_mention_indices))]
        for entity in entities:
            is_coreferred = False
            for i, mention_indices in enumerate(entity_mention_indices):
                for start_index, end_index in mention_indices:
                    if entity.start_offset >= start_index and entity.end_offset <= end_index:
                        entity_sets[i].append(entity)
                        is_coreferred = True
            if not is_coreferred:
                entity_sets.append([entity])
        return entity_sets
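`resolve_coreferences` only reads `start_offset` and `end_offset` from each entity, so a minimal stand-in entity type is enough to try it out (hypothetical; the project's real entity class is defined elsewhere, and CORENLP_HOME must point at a CoreNLP install):

from dataclasses import dataclass

@dataclass
class Entity:
    start_offset: int
    end_offset: int
    text: str = ''

resolver = StanfordCoreferenceResolver()
entities = [Entity(0, 5, 'Alice'), Entity(16, 19, 'she')]
groups = resolver.resolve_coreferences('Alice said that she was tired.', entities)
print(len(groups))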
def start_corenlp_client():
    corenlp_client = CoreNLPClient(
        start_server=True,
        endpoint='http://localhost:9000',
        memory=MEMORY_CORENLP,
        threads=50,
        timeout=10000000,
        annotators=['openie'],
        output_format="json",
        properties={
            'annotators': 'openie',
            'inputFormat': 'text',
            'outputFormat': 'json',
            'serializer':
            'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer',
            'openie.affinity_probability_cap': '1.0',
            "openie.max_entailments_per_clause": "500"
        })
    corenlp_client.TIMEOUT = 100
    return corenlp_client
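Because the client is created with `output_format='json'`, each sentence in the response carries an `openie` list of subject/relation/object dictionaries, which is the same structure Example #22 below indexes into. A sketch (`MEMORY_CORENLP` is a module constant in the original and is only set here so the snippet stands alone):

# Hypothetical usage of the factory above.
MEMORY_CORENLP = '8G'
client = start_corenlp_client()
output = client.annotate('Obama was born in Hawaii.')
for sentence in output['sentences']:
    for triple in sentence['openie']:
        print(triple['subject'], triple['relation'], triple['object'])
client.stop()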
Example #21
    def __init__(self, corpus, target=None, **kwargs):
        """
        The corpus is the `HTMLCorpusReader` to preprocess and pickle.
        The target is the directory on disk to output the pickled corpus to.
        """
        self.corpus = corpus
        self.target = target
        self.tagger = pos_tagger('spacy')

        # Modification for dibutade
        if model == 'stanford':
            os.environ['CORENLP_HOME'] = ('C:/Users/alain/OneDrive/Ateliers Dibutade/'
                                          'NLP/stanford-corenlp-full-2018-10-05')
            self.pos_tagger = CoreNLPClient(properties='french',
                                            annotators=[
                                                'pos',
                                            ],
                                            timeout=30000,
                                            memory='1G')
        elif model == 'spacy':
            self.nlp = spacy.load('fr_core_news_sm')
Example #22
def runClient(text):
    print('---')
    print('starting up Java Stanford CoreNLP Server...')
    # set up the client
    #with CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner','natlog','openie'], properties={"outputFormat": "json","openie.triple.strict":"true","splitter.disable" : "true","openie.max_entailments_per_clause":"1"}, be_quiet=False, timeout=30000, memory='16G') as client:
    with CoreNLPClient(annotators=[
            'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'natlog', 'openie'
    ],
                       be_quiet=True,
                       timeout=30000,
                       memory='16G') as client:
        # submit the request to the server
        # Iterate over all tokens in all sentences, and print out the word, lemma, pos and ner tags
        text = "Trump is the President of America"
        document = client.annotate(text)
        output = client.annotate(text,
                                 properties={
                                     "outputFormat": "json",
                                     "openie.triple.strict": "true",
                                     "splitter.disable": "true",
                                     "openie.max_entailments_per_clause": "1"
                                 })
        #print(output)
        result = [output["sentences"][0]["openie"] for item in output]
        print(result)
        print(result[0][0]["subject"])
        # [[{'subject': 'John', 'subjectSpan': [0, 1], 'relation': 'jumps over', 'relationSpan': [1, 3], 'object': 'fox', 'objectSpan': [4, 5]}]]
        for i in result:
            for rel in i:
                relationSent = rel['relation'], rel['subject'], rel['object']
                print(relationSent)
        # print("{:12s}\t{:12s}\t{:6s}\t{}".format("Word", "Lemma", "POS", "NER"))
        tags = getTags()
        # #print(document.sentence)
        props = []
        for key, value in enumerate(document.sentence):
            for t in value.token:
                props.append({
                    "word": t.word,
                    "lemma": t.lemma,
                    "pos": t.pos,
                    "pos_full": tags[t.pos],
                    "ner": t.ner
                })
                #print("Word: {}, Lemma: {}, POS: {}, NER: {}".format(t.word,t.lemma,tags[t.pos],t.ner))

        replaceWord = getAnalysis(props)
        if replaceWord["replace"] and replaceWord["type"] == "PERSON":
            text = text.replace(replaceWord["wordToReplace"], "who") + " ?"
            print(text)
        if replaceWord["replace"] and replaceWord["type"] == "LOCATION":
            #text = ''.join("Where is ",relationSent["subject"]," ?")
            print("Where is {} ?".format(replaceWord["wordToReplace"]))
Example #23
def _parse_segmenttokenize_en(document, usage='experiments'):
    """
	Given a document, in str format, containing one or more sentences, returns a set of segmented 
	and tokenized strings, with indexing information. This format is the basis for: (i) the format 
	for storing information on sentences and tokens in the CoNLL 2015 and 2016 Shared Task; 
	(ii) the .Words attribute in Parse-class objects. This function uses the stanford-corenlp 
	package and requires the CoreNLP Java package to be downloaded and built (with Ant or Maven) 
	and saved to the 04_utils folder. 
	"""
    cwd = getcwd()
    if usage in ('production', 'experiments'):
        version = 'stanford-corenlp-full-2018-10-05/'

        corenlp_path = re.findall(r'\S*/marta-v2',
                                  cwd)[0] + '/04_utils/' + version
        environ["CORENLP_HOME"] = corenlp_path

        with CoreNLPClient(
                annotators="tokenize ssplit pos".split(),
                memory='1G',
                be_quiet=True,
                max_char_length=100000,
        ) as client:
            annotated = client.annotate(document, output_format='json')
        client.stop()

    elif usage == 'experiments_352':
        version = 'stanford-corenlp-full-2018-10-05/'  #-2015-04-20/'
        corenlp_path = re.findall(r'\S*/marta-v2',
                                  cwd)[0] + '/04_utils/' + version
        chdir(corenlp_path)
        args = [
            "*", '-Xmx500m', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', 'tokenize,ssplit,pos', '-tokenize.whitespace',
            '-ssplit.eolonly', '-outputFormat', 'json', '-maxLength', '10000'
        ]  # necessary to set -maxLength (default is only 200); necessary to
        # specify -tokenize.whitespace, since our sentence is joined from already-tokenized.
        process = subprocess.Popen(['java', '-cp'] + args,
                                   stdout=subprocess.PIPE,
                                   stdin=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)

        annotated, error = process.communicate(input=document.encode('utf-8'))
        chdir(
            cwd
        )  # setting the current working directory back to original, else it causes errors downstream

        # convert byte to utf-8
        annotated = annotated.decode('utf-8')

        # extract the parse sections

    return annotated
Example #24
    def execute_stanford_analysis():
        try:
            os.environ["CORENLP_HOME"] = (
                "C:\\Users\\wenga\\OneDrive - Berner Fachhochschule\\CAS PML\\"
                "90_Projektarbeit\\devNG\\_data\\stanford-corenlp-full-2018-10-05")
            print("Downloading english dictionary")
            # stanfordnlp.download('en', force=True)
            print('---')
            print('starting up Java Stanford CoreNLP Server...')
            # en_nlp = stanfordnlp.Pipeline(lang='en')
            # Processing English text
            # en_doc = en_nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
            text = "your rich bitch. Yep. I mean it. RIACH BIATCH. F**k you you f*****g idiot piece of shit. I hate you. Die m**********r."
            # set up the client
            # with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse', 'coref'],
            #                   timeout=60000, memory='16G') as client:
            with CoreNLPClient(annotators=['sentiment'],
                               timeout=10000,
                               memory='20G') as client:

                tweets = load_twitter_data(5000000)
                x = 1
                result = []
                for singleTweet in tweets:
                    # ann = client.annotate(singleTweet[1], properties={'annotators': 'sentiment', 'outputFormat': 'json'})
                    ann = client.annotate(singleTweet[1],
                                          properties={
                                              'annotators': 'sentiment',
                                              'outputFormat': 'json'
                                          })
                    neg = 0
                    neu = 0
                    pos = 0
                    for sentence in ann['sentences']:
                        if sentence['sentimentValue'] == "1":
                            neg += 1
                        elif sentence['sentimentValue'] == "2":
                            neu += 1
                        else:
                            pos += 1

                    result.append((singleTweet[0], neg / len(ann['sentences']),
                                   neu / len(ann['sentences']),
                                   pos / len(ann['sentences'])))

                    if x % 1000 == 0:
                        print("[{}] Sentimented {}/{} tweets".format(
                            datetime.now(), x, len(tweets)))
                    x += 1
        except Error as e:
            print("SQL error: {}".format(e))
        except Exception:
            print("Unexpected error: {}".format(sys.exc_info()[0]))
Example #25
    def __init__(self, core_nlp_version: str = '2018-10-05', annotators=None):
        if annotators is None or not isinstance(annotators, list):
            annotators = ['openie', 'dcoref']
        self.remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(
            core_nlp_version)
        self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir / Path(
                'stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading from %s.' % self.remote_url)
            output_filename = wget.download(self.remote_url,
                                            out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()

        os.environ['CORENLP_HOME'] = str(
            self.install_dir / 'stanford-corenlp-full-{}'.format(core_nlp_version))
        from stanfordnlp.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=annotators, memory='8G')
        self.parser = CoreNLPParser()
Example #26
def demo_test(replace_like=False):
    # ['tokenize','ssplit','pos','lemma','ner','parse','depparse','coref']
    # text = "A cat in a cup is like a dog in a bucket."
    # text = "Rumor of a big battle spread like a grassfire up the valley." # This one doesn't parse correctly.
    # text = "When the sun came out, Stevie strode proudly into Orange Square," \
    #       "smiling like a landlord on industrious tenants. A cat is like a dog."
    text = ''
    text_big = '''and yet like a child among adults .
    I don't mean a few aesthetes who play about with sensations , like a young prince in a miniature dabbling his hand in a pool .
    Oh , he was being queer and careful , pawing about in the drawer and holding the bottle like a snake at the length of his arm .
    `` I went to the city And there I did Weep , Men a-crowing like asses , And living like sheep .
    Rumor of a big battle spread like a grassfire up the valley .
    When the sun came out , Stevie strode proudly into Orange Square , smiling like a landlord on industrious tenants .
    They gave the room a strange note of incongruity , like a mole on a beautiful face .
    It always came on , faithfully , just like a radio or juke box , whenever he started to worry too much about something , when the bad things tried to push their way into him .
    The design of a mechanical interlocking frame is much like a mechanical puzzle , but once understood , the principles can be applied to any track and signal arrangement .
    The sticks fell like a shower around her and she felt them sting her flesh and send tiny points of pain along her thighs .
    I saw the pony fall like a stone and the young warrior flew over its head , bouncing like a rubber ball .
    '''
    text += '''This dog is analogous to an atom.'''

    with CoreNLPClient(annotators=[
            'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse',
            'coref'
    ],
                       timeout=60000,
                       memory='4G',
                       be_quiet=True) as client:
        print("##########-----About to annotate...-----")
        # If we want to replace like, that needs to happen here. Let's not do that, however.
        ann = client.annotate(text)
        sen = ann.sentence[0]
        token = sen.token[0]
        print("*(((((")
        print(token.word)
        # sentence is a Sentence. Where and how is this defined?
        for sentence in ann.sentence:
            if replace_like:
                replace_with_like(sentence, signals, "like")
            for token in sentence.token:
                print(token.word, end=' ')
            print()

            constituency_parse = sentence.parseTree

            my_parse = CoreNLPNode(constituency_parse)
            my_parse.create_tree()
            my_parse.thematic_search()
            print("BASE: ", my_parse.roles["base"], "TARGET: ",
                  my_parse.roles["target"], "ACTION: ",
                  my_parse.roles["action"])
Example #27
    def __init__(self):
        with Timer() as self.model_load_time:
            os.environ["CORENLP_HOME"] = CORENLP_HOME
            from stanfordnlp.server import CoreNLPClient

            client = CoreNLPClient(
                annotators=["tokenize", "ssplit"],
                timeout=30000,
                memory="2G",
                properties={
                    "tokenize.language": "de",
                    "outputFormat": "text"
                },
            )
            self.processor = client.annotate
Example #28
def syntactic_parse_texts(
    texts: List[str],
    tokenize=False,
    sentence_split=False,
    verbose=False,
    with_constituency_parse=False,
):
    corenlp_annotators = [
        "tokenize",
        "ssplit",
        "pos",
        "lemma",
        "ner",
        "depparse",
    ]
    if with_constituency_parse:
        corenlp_annotators.append("parse")
    annotators_properties = {
        "tokenize.whitespace": not tokenize,
        "ssplit.eolonly": not sentence_split,
        "depparse.model": "edu/stanford/nlp/models/parser/nndep/english_SD.gz",
        "outputFormat": "json",
    }
    if not STANFORD_CORENLP_DIR.exists():
        download_stanford_corenlp()
    os.environ["CORENLP_HOME"] = str(STANFORD_CORENLP_DIR)

    parse_results = []

    with CoreNLPClient(
            annotators=corenlp_annotators,
            properties=annotators_properties,
            threads=40,
    ) as client:
        for text in tqdm(texts, disable=(not verbose)):
            if isinstance(text, List):
                text = " ".join(text)
            raw_parse_result = client.annotate(text)
            parse_result = format_parser_output(raw_parse_result["sentences"])

            if len(parse_result["sentences"]) > 1 and not sentence_split:
                parse_result = join_parse_result(parse_result)
            elif sentence_split:
                parse_result = split_parse_result(parse_result["sentences"])

            parse_results.append(parse_result)

    return parse_results
Example #29
def noun_adjective_pairer(reviews_per_business):
    pair_list = []
    with CoreNLPClient(
            annotators=["tokenize", "ssplit", "pos", "depparse", "lemma"],
            timeout=120000,
            memory="5G",
    ) as client:
        for review in reviews_per_business:
            ann = client.annotate(review)
            for sentence in ann.sentence:
                dependency_parse = sentence.basicDependencies
                tokens = sentence.token

                predicted_heads_and_dependencies = {}
                predicted_pos = []
                predicted_lemm = []
                for i in range(len(tokens)):
                    predicted_pos.append(tokens[i].pos)
                    predicted_lemm.append(tokens[i].lemma)

                for i in range(len(dependency_parse.edge)):
                    source = dependency_parse.edge[i].source
                    target = dependency_parse.edge[i].target
                    dep = dependency_parse.edge[i].dep
                    head_pos = predicted_pos[source - 1]
                    if target - 1 in predicted_heads_and_dependencies:
                        predicted_heads_and_dependencies_list = predicted_heads_and_dependencies[
                            target - 1]
                        predicted_heads_and_dependencies_list.append(
                            (source - 1, dep, head_pos))
                    else:
                        predicted_heads_and_dependencies[target - 1] = [
                            (source - 1, dep, head_pos)
                        ]
                noun_pairs = get_noun_pairs_index(
                    predicted_heads_and_dependencies)
                adjective_pairs = get_adjective_pairs_index(
                    predicted_heads_and_dependencies)
                noun_adjective_pairs = get_noun_adjective_pairs(
                    predicted_heads_and_dependencies,
                    predicted_pos,
                    predicted_lemm,
                    noun_pairs,
                    adjective_pairs,
                )
                pair_list.extend(noun_adjective_pairs)

    return pair_list
Example #30
def pipeline1(text, r, t):
    extractedRelations = []
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
            timeout=450000,
            memory='4G',
            endpoint="http://localhost:9000",
            threads=7) as pipeline1:
        print(
            "\tAnnotating the webpage using [tokenize, ssplit, pos, lemma, ner] annotators ..."
        )
        ann = pipeline1.annotate(text)
        sentenceNumber = len(ann.sentence)
        namedEntity = patterns[toRelation[r]]
        print(
            "\tExtracted %d sentences. Processing each sentence one by one to check for presence of right pair of named entity types; if so, will run the second pipeline ..."
            % sentenceNumber)

        # if a sentence has the targeted two named entities
        # add the sentence to the list that the element of which perform extracting kbp annotations
        processedSentence = []
        for i, sentence in enumerate(ann.sentence):
            # check if those named entity in the query all appear in the extract sentence
            firstEntity = False
            secondEntity = False
            for token in sentence.token:
                if toRelation[r] == relation[2]:
                    if token.ner == namedEntity[0]:
                        firstEntity = True
                    if token.ner in namedEntity[1]:
                        secondEntity = True
                else:
                    if token.ner == namedEntity[0]:
                        firstEntity = True
                    if token.ner == namedEntity[1]:
                        secondEntity = True

            # if both targeted named entity appear, the sentence adds to the list
            if firstEntity and secondEntity:
                processedSentence.append([i, to_text(sentence)])

        # extract the relations in the list of sentence through pipeline2
        extractedRelations += pipeline2(processedSentence, t)
        print("Extracted kbp annotations for %d out of total %d sentences" %
              (len(processedSentence), sentenceNumber))

    return extractedRelations