Example #1
    def timexTagText(self, altText=None):
        """Tags all the temporal expressions and surrounds them with <TIMEX2> XML tags in line with the text

        Args:
            None
            
        Returns:
            tagged text (str)
        
        """
        """When altText is specified, the method assumes that some random text is being sent to be tagged, so doesn't save in dictionary"""
        if altText is not None:
            raw = altText
            altOutput = timex.tag(raw)
            return altOutput

        else:
            """Otherwise, we first check if it exists in the textList dict, if not, it is created and returned"""
            self.file = open(self.filename)
            raw = self.file.read()
            if self.textList.get('timexTagText') is None:
                self.textList['timexTagText'] = timex.tag(raw)

            self.file.close()

        return self.textList.get('timexTagText')
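A minimal usage sketch (hypothetical; the constructor below is assumed, since only the filename/textList attributes appear in the example): the method tags and caches the stored narrative, while ad-hoc text passed via altText is tagged without being cached.

pre = Preprocessor('narrative.txt')                        # hypothetical constructor taking a filename
cached = pre.timexTagText()                                # tags the file contents and caches the result
adhoc = pre.timexTagText('See me again next Tuesday.')     # tagged on the fly, never stored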
Example #2
def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is " + content)
    start = time.time()
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    names.extend(date_time)
    result = {"result": "success", "names": names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?}
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units
    jsonDoc = json.dumps(result,
                         sort_keys=True,
                         indent=4,
                         separators=(',', ': '))
    end = time.time()
    print "NER took " + str(end - start) + " seconds"
    return jsonDoc
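Since the handler reads raw text from the request stream, a client simply POSTs plain text to whatever route the function is bound to. A hypothetical call (the URL and route below are assumptions, not shown above):

import requests

text = 'Barack Obama visited Paris on 4 July 2015.'
resp = requests.post('http://localhost:5000/ner', data=text)   # hypothetical URL and route
print(resp.json()['names'])                                     # entities plus any temporal tags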
Example #3
def extract_temporal_from_doc(doc):
    enriched = copy.deepcopy(doc)
    extracted_date_list = []
    master_list = []

    if '_version_' in enriched.keys():
        del enriched['_version_']
    if 'boost' in enriched.keys():
        del enriched['boost']

    if 'content' in doc.keys():
        master_list += doc['content']
    if 'title' in doc.keys():
        master_list += doc['title']

    for content in master_list:
        temporal_tags = timex.tag(content)
        for tag in temporal_tags:
            if tag not in extracted_date_list:

                if tag.isdigit() and len(tag) == 4:
                    # date_list = get year month day as list
                    datestring_tag = get_date_string_from_ymd(year=int(tag))
                    extracted_date_list.append(datestring_tag)

                # TODO: don't do anything for now because Solr doesn't accept non formatted
                # date strings (so make function to get a date format for the string tag)
                # else:
                    # extracted_date_list.append(tag)

    if extracted_date_list:
        enriched['extracted_dates'] = extracted_date_list

    return enriched
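get_date_string_from_ymd is not shown in this example; a rough sketch of what it might do, assuming Solr's ISO-8601 date format and defaulting the missing month and day to January 1st:

def get_date_string_from_ymd(year, month=1, day=1):
    # Hypothetical helper: build a Solr-friendly UTC timestamp from the parts we have.
    return '%04d-%02d-%02dT00:00:00Z' % (year, month, day)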
Example #4
    def __init__(self, query_string):
        self.query_string = query_string
        sentences = nltk.sent_tokenize(query_string)
        self.tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        self.tagged_sentences = [nltk.pos_tag(sentence) for sentence in self.tokenized_sentences]
        self.binary_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=True)
        self.multiclass_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=False)
        self.temporal_sentences = timex.ground(timex.tag(query_string), mx.DateTime.gmt())
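A hypothetical usage of this constructor (the class name below is invented; only the __init__ is shown): build it from a query string and read the precomputed fields.

q = QueryAnalyzer('Show me flights to Boston next Friday')   # hypothetical class name
print(q.tagged_sentences)       # per-sentence POS tags
print(q.temporal_sentences)     # TIMEX tags grounded against the current GMT time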
Example #5
    def timexTagText(self, altText=None):
        """Tags all the temporal expressions and surrounds them with <TIMEX2> XML tags in line with the text

        Args:
            altText (str) The text to be tagged, if it is not the same as the whole narrative the preprocessor was created with. This text won't be stored.
            
        Returns:
            tagged text (str)
        
        """
        """When altText is specified, the method assumes that some random text is being sent to be tagged, so doesn't save in dictionary"""
        if altText is not None:
            raw = altText
            altOutput = timex.tag(raw)
            return altOutput

        else:
            """Otherwise, we first check if it exists in the textList dict, if not, it is created and returned"""
            raw = self.rawText()
            if Preprocessor.textList.get('timexTagText') is None:
                Preprocessor.textList['timexTagText'] = timex.tag(raw)

        return Preprocessor.textList.get('timexTagText')
Example #6
    def __init__(self, query_string):
        self.query_string = query_string
        sentences = nltk.sent_tokenize(query_string)
        self.tokenized_sentences = [
            nltk.word_tokenize(sentence) for sentence in sentences
        ]
        self.tagged_sentences = [
            nltk.pos_tag(sentence) for sentence in self.tokenized_sentences
        ]
        self.binary_chunked_sentences = nltk.batch_ne_chunk(
            self.tagged_sentences, binary=True)
        self.multiclass_chunked_sentences = nltk.batch_ne_chunk(
            self.tagged_sentences, binary=False)
        self.temporal_sentences = timex.ground(timex.tag(query_string),
                                               mx.DateTime.gmt())
Example #7
def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is "+content)
    start = time.time()
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt)
    names.extend(date_time)
    result = {"result" : "success", "names" : names}
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print "NER took "+str(end - start)+" seconds"
    return jsonDoc
Example #8
def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is "+content)
    start = time.time()
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    names.extend(date_time)
    result = {"result" : "success", "names" : names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?}
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged),'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print "NER took "+str(end - start)+" seconds"
    return jsonDoc
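To see what the unit grammar chunks, here is a small standalone sketch that only needs NLTK (plus its tokenizer and tagger data), independent of the Flask handler above:

import nltk

grammar = '''unit: {<CD><NNS>?<NN.*>?}
             unit: {<CD><JJ>?<NN.*>}
          '''
parser = nltk.RegexpParser(grammar)
tokens = nltk.pos_tag(nltk.word_tokenize('The route covers 5 kilometers and takes 30 minutes'))
# Cardinal numbers followed by a (possibly plural) noun end up inside 'unit' subtrees.
print(parser.parse(tokens))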