def timexTagText(self, altText=None):
    """Tag all temporal expressions with inline <TIMEX2> XML tags.

    Args:
        altText (str, optional): Text to tag instead of the narrative
            file. When given, the result is NOT cached in
            ``self.textList``.

    Returns:
        str: The tagged text.
    """
    # Ad-hoc text: tag it directly; nothing is cached.
    if altText is not None:
        return timex.tag(altText)

    # Narrative text: compute once, cache in the textList dict, reuse after.
    if self.textList.get('timexTagText') is None:
        # 'with' guarantees the file is closed even if timex.tag raises
        # (the original leaked the handle on an exception), and the file
        # is no longer opened at all on a cache hit.
        with open(self.filename) as source:
            raw = source.read()
        self.textList['timexTagText'] = timex.tag(raw)
    return self.textList.get('timexTagText')
def namedEntityRecognizer():
    """Run NER (plus TIMEX date tagging and optional unit extraction)
    over the incoming request body.

    Returns:
        str: JSON document with keys ``result``, ``names`` and, when the
        ``Units`` flag is enabled, ``units``.
    """
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is " + content)
    start = time.time()

    # Temporal expressions first, then classic NLTK NE chunking.
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    names.extend(date_time)
    result = {"result": "success", "names": names}

    if Units:
        # Shallow grammar: a cardinal number optionally followed by
        # nouns/adjective, e.g. "5 kilometers".
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units

    jsonDoc = json.dumps(result, sort_keys=True, indent=4,
                         separators=(',', ': '))
    end = time.time()
    # Parenthesized print of a single argument is valid in both
    # Python 2 and 3; the bare print statement was 2-only.
    print("NER took " + str(end - start) + " seconds")
    return jsonDoc
def extract_temporal_from_doc(doc):
    """Extract 4-digit year tags from a document's content/title fields.

    Args:
        doc (dict): Solr-style document; ``content`` and ``title`` are
            assumed to be lists of strings — TODO confirm, since a plain
            string would be iterated character-by-character by ``+=``.

    Returns:
        dict: Deep copy of ``doc`` minus Solr bookkeeping keys
        (``_version_``, ``boost``), with an ``extracted_dates`` list
        added when any year tags were found.
    """
    enriched = copy.deepcopy(doc)
    # Drop Solr-internal fields that must not be re-indexed.
    enriched.pop('_version_', None)
    enriched.pop('boost', None)

    master_list = []
    if 'content' in doc:
        master_list += doc['content']
    if 'title' in doc:
        master_list += doc['title']

    extracted_date_list = []
    # Track raw tags already processed. The original compared raw tags
    # against the *formatted* date strings in extracted_date_list, so it
    # never actually deduplicated.
    seen_tags = set()
    for content in master_list:
        for tag in timex.tag(content):
            if tag in seen_tags:
                continue
            seen_tags.add(tag)
            if tag.isdigit() and len(tag) == 4:
                # Bare 4-digit year -> canonical date string.
                datestring_tag = get_date_string_from_ymd(year=int(tag))
                extracted_date_list.append(datestring_tag)
            # TODO: don't do anything for now because Solr doesn't accept
            # non formatted date strings (so make function to get a date
            # format for the string tag)
            # else:
            #     extracted_date_list.append(tag)

    if extracted_date_list:
        enriched['extracted_dates'] = extracted_date_list
    return enriched
def __init__(self, query_string):
    """Analyze *query_string*: sentence-split, tokenize, POS-tag,
    NE-chunk (binary and multiclass), and ground its temporal
    expressions against the current GMT time."""
    self.query_string = query_string
    raw_sentences = nltk.sent_tokenize(query_string)
    self.tokenized_sentences = list(map(nltk.word_tokenize, raw_sentences))
    self.tagged_sentences = list(map(nltk.pos_tag, self.tokenized_sentences))
    self.binary_chunked_sentences = nltk.batch_ne_chunk(
        self.tagged_sentences, binary=True)
    self.multiclass_chunked_sentences = nltk.batch_ne_chunk(
        self.tagged_sentences, binary=False)
    self.temporal_sentences = timex.ground(
        timex.tag(query_string), mx.DateTime.gmt())
def timexTagText(self, altText=None):
    """Tag every temporal expression and wrap it in an inline <TIMEX2>
    XML tag.

    Args:
        altText (str) The text to be tagged, if it is not the same as
        the whole narrative the preprocessor was created with. This
        text won't be stored.

    Returns:
        tagged text (str)
    """
    # One-off text is tagged immediately and never cached.
    if altText is not None:
        return timex.tag(altText)
    # Full narrative: populate the shared class-level cache on first use.
    narrative = self.rawText()
    if Preprocessor.textList.get('timexTagText') is None:
        Preprocessor.textList['timexTagText'] = timex.tag(narrative)
    return Preprocessor.textList.get('timexTagText')
def __init__(self, query_string):
    """Run the full NLP pipeline over *query_string*: tokenization,
    POS tagging, binary and multiclass NE chunking, and temporal
    grounding against the current GMT time."""
    self.query_string = query_string
    sentence_list = nltk.sent_tokenize(query_string)
    tokenized = [nltk.word_tokenize(sent) for sent in sentence_list]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]
    self.tokenized_sentences = tokenized
    self.tagged_sentences = tagged
    self.binary_chunked_sentences = nltk.batch_ne_chunk(tagged, binary=True)
    self.multiclass_chunked_sentences = nltk.batch_ne_chunk(tagged,
                                                            binary=False)
    self.temporal_sentences = timex.ground(timex.tag(query_string),
                                           mx.DateTime.gmt())
def namedEntityRecognizer():
    """Run NER plus TIMEX date tagging over the incoming request body.

    Returns:
        str: JSON document with keys ``result`` and ``names``.
    """
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is " + content)
    start = time.time()
    # Temporal expressions first, then classic NLTK NE chunking.
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt)
    names.extend(date_time)
    result = {"result": "success", "names": names}
    jsonDoc = json.dumps(result, sort_keys=True, indent=4,
                         separators=(',', ': '))
    end = time.time()
    # Parenthesized print of a single argument is valid in both
    # Python 2 and 3; the bare print statement was 2-only.
    print("NER took " + str(end - start) + " seconds")
    return jsonDoc
def namedEntityRecognizer():
    """Run NER, TIMEX date tagging, and optional unit extraction over
    the incoming request body.

    Returns:
        str: JSON document with keys ``result``, ``names`` and, when the
        ``Units`` flag is enabled, ``units``.
    """
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    if Verbose:
        echo2("Incoming content is " + content)
    start = time.time()

    # Temporal expressions first, then classic NLTK NE chunking.
    date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    names.extend(date_time)
    result = {"result": "success", "names": names}

    if Units:
        # Shallow grammar: a cardinal number optionally followed by
        # nouns/adjective, e.g. "5 kilometers".
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units

    jsonDoc = json.dumps(result, sort_keys=True, indent=4,
                         separators=(',', ': '))
    end = time.time()
    # Parenthesized print of a single argument is valid in both
    # Python 2 and 3; the bare print statement was 2-only.
    print("NER took " + str(end - start) + " seconds")
    return jsonDoc