Example #1
0
    def timexTagText(self, altText=None):
        """Tag temporal expressions, wrapping them in inline <TIMEX2> XML tags.

        Args:
            altText (str, optional): Ad-hoc text to tag instead of the
                narrative file. When given, the result is NOT cached.

        Returns:
            str: the text with temporal expressions wrapped in <TIMEX2> tags.
        """
        # Ad-hoc text: tag it directly without touching the cache.
        if altText is not None:
            return timex.tag(altText)

        # Narrative text: compute once, then serve every call from the cache.
        # (The original re-opened and re-read the file even on cache hits.)
        if self.textList.get('timexTagText') is None:
            # The with-statement guarantees the file is closed even if
            # timex.tag raises; the original leaked the handle in that case.
            with open(self.filename) as source:
                self.textList['timexTagText'] = timex.tag(source.read())

        return self.textList.get('timexTagText')
Example #2
0
def namedEntityRecognizer():
    """Run NER, temporal tagging and (optionally) unit extraction over the
    request body and return the results as a pretty-printed JSON string.

    Reads the raw request stream and returns a JSON document of the form
    {"result": "success", "names": [...]} with an optional "units" key
    when the Units flag is set.
    """
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is " + content)
    start = time.time()
    # Temporal expressions via timex; named entities via the NLTK pipeline.
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    names.extend(date_time)
    result = {"result": "success", "names": names}
    if Units:
        # Chunk grammar matching "<number> [modifier] <noun>" phrases,
        # e.g. "5 meters".
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units
    jsonDoc = json.dumps(result,
                         sort_keys=True,
                         indent=4,
                         separators=(',', ': '))
    end = time.time()
    # Parenthesized single argument: prints identically under Python 2's
    # print statement and parses as a call under Python 3.
    print("NER took " + str(end - start) + " seconds")
    return jsonDoc
def extract_temporal_from_doc(doc):
    """Return a deep copy of *doc* enriched with an 'extracted_dates' list.

    Scans the 'content' and 'title' fields (assumed to be lists of strings —
    TODO confirm against the Solr schema) for temporal expressions. Bare
    four-digit year tags are converted to date strings and collected under
    'extracted_dates'. The Solr-internal '_version_' and 'boost' keys are
    dropped from the copy.

    Args:
        doc (dict): Solr-style document.

    Returns:
        dict: deep copy of *doc*, minus '_version_'/'boost', plus
        'extracted_dates' when any dates were found.
    """
    enriched = copy.deepcopy(doc)

    # Strip Solr-internal bookkeeping fields from the enriched copy.
    for internal_key in ('_version_', 'boost'):
        enriched.pop(internal_key, None)

    # Gather every searchable text fragment into one list.
    master_list = []
    if 'content' in doc:
        master_list += doc['content']
    if 'title' in doc:
        master_list += doc['title']

    extracted_date_list = []
    for content in master_list:
        for tag in timex.tag(content):
            # Only bare 4-digit years are handled for now.
            if tag.isdigit() and len(tag) == 4:
                datestring_tag = get_date_string_from_ymd(year=int(tag))
                # Dedupe on the converted value: the original compared the
                # raw tag against the list of converted strings, so the
                # duplicate check could never match.
                if datestring_tag not in extracted_date_list:
                    extracted_date_list.append(datestring_tag)
            # TODO: don't do anything for now because Solr doesn't accept
            # non formatted date strings (so make function to get a date
            # format for the string tag)
            # else:
            #     extracted_date_list.append(tag)

    if extracted_date_list:
        enriched['extracted_dates'] = extracted_date_list

    return enriched
 def __init__(self, query_string):
     """Pre-compute tokenized, POS-tagged, NE-chunked and temporally
     grounded views of *query_string*."""
     self.query_string = query_string
     # Sentence-split, then tokenize and POS-tag sentence by sentence.
     self.tokenized_sentences = [
         nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(query_string)
     ]
     self.tagged_sentences = [
         nltk.pos_tag(tokens) for tokens in self.tokenized_sentences
     ]
     # Two NE-chunking passes over the same tagged input: one binary
     # (NE / not-NE) and one multiclass (PERSON, ORGANIZATION, ...).
     self.binary_chunked_sentences = nltk.batch_ne_chunk(
         self.tagged_sentences, binary=True)
     self.multiclass_chunked_sentences = nltk.batch_ne_chunk(
         self.tagged_sentences, binary=False)
     # Ground temporal expressions relative to the current GMT time.
     self.temporal_sentences = timex.ground(
         timex.tag(query_string), mx.DateTime.gmt())
    def timexTagText(self, altText=None):
        """Tag temporal expressions, wrapping them in inline <TIMEX2> XML tags.

        Args:
            altText (str): The text to be tagged, if it is not the same as the
                whole narrative the preprocessor was created with. This text
                won't be stored.

        Returns:
            str: the tagged text.
        """
        # Ad-hoc text is tagged directly and never cached.
        if altText is not None:
            return timex.tag(altText)

        # Narrative text: read it, populate the class-level cache on first
        # use, and serve every later call from the cache.
        raw = self.rawText()
        if Preprocessor.textList.get('timexTagText') is None:
            Preprocessor.textList['timexTagText'] = timex.tag(raw)

        return Preprocessor.textList.get('timexTagText')
 def __init__(self, query_string):
     """Build NLP views of *query_string*: tokens, POS tags, NE chunks
     (binary and multiclass) and grounded temporal expressions."""
     self.query_string = query_string
     sentence_list = nltk.sent_tokenize(query_string)
     self.tokenized_sentences = [nltk.word_tokenize(s) for s in sentence_list]
     self.tagged_sentences = [nltk.pos_tag(t) for t in self.tokenized_sentences]
     # Two NE-chunk passes over the same tagged sentences.
     self.binary_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences,
                                                         binary=True)
     self.multiclass_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences,
                                                             binary=False)
     # Ground temporal expressions against the current GMT time.
     self.temporal_sentences = timex.ground(timex.tag(query_string),
                                            mx.DateTime.gmt())
Example #7
0
def namedEntityRecognizer():
    """Run NER and temporal tagging over the request body and return the
    results as a pretty-printed JSON string.

    Returns:
        str: JSON document of the form {"result": "success", "names": [...]}.
    """
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is "+content)
    start = time.time()
    # Temporal expressions via timex; named entities via the NLTK pipeline.
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt)
    names.extend(date_time)
    result = {"result" : "success", "names" : names}
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    # Parenthesized single argument: prints identically under Python 2's
    # print statement and parses as a call under Python 3.
    print("NER took "+str(end - start)+" seconds")
    return jsonDoc
Example #8
0
def namedEntityRecognizer():
    """Run NER, temporal tagging and (optionally) unit extraction over the
    request body and return the results as a pretty-printed JSON string.

    Returns:
        str: JSON document {"result": "success", "names": [...]} with an
        optional "units" key when the Units flag is set.
    """
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is "+content)
    start = time.time()
    # Temporal expressions via timex; named entities via the NLTK pipeline.
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    names.extend(date_time)
    result = {"result" : "success", "names" : names}
    if Units:
        # Chunk grammar matching "<number> [modifier] <noun>" phrases,
        # e.g. "5 meters".
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    # Parenthesized single argument: prints identically under Python 2's
    # print statement and parses as a call under Python 3.
    print("NER took "+str(end - start)+" seconds")
    return jsonDoc