class RuTokenizer(Tokenizer):

    def __init__(self, **kwargs):
        """
        Args:
            annotators: set that can include pos, lemma, and ner.
        """
        self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
        self.include_pos = {'pos'} & self.annotators
        self.include_lemma = {'lemma'} & self.annotators
        self.include_ner = {'ner'} & self.annotators
        self.morph = pymorphy2.MorphAnalyzer()
        self.wt = WhitespaceTokenizer()
        self.rt = RegexpTokenizer(r'\w+')

    def __call__(self, text):
        # We don't treat new lines as tokens.
        clean_text = text.replace('\n', ' ')

        # remove punctuation
        clean_text = ' '.join(self.rt.tokenize(clean_text))

        # split by whitespaces and get spans
        spans = list(self.wt.span_tokenize(clean_text))
        n = len(spans)

        data = []
        for i in range(n):
            start_idx, end_idx = spans[i]
            token = clean_text[start_idx:end_idx]

            start_ws = start_idx
            if i + 1 < n:
                end_ws = spans[i + 1][0]
            else:
                end_ws = start_idx + len(token)
            token_ws = clean_text[start_ws:end_ws]

            lemma, pos, ent_type = '', '', ''
            if self.include_pos or self.include_lemma:
                p = self.morph.parse(token)[0]
                if self.include_lemma:
                    lemma = p.normal_form
                if self.include_pos:
                    pos = p.tag.POS
            if self.include_ner:
                entities = Text(token, hint_language_code='ru').entities
                if len(entities):
                    ent_type = entities[0].tag

            data.append((token, token_ws, spans[i], pos, lemma, ent_type))

        return Tokens(data, self.annotators, opts={'non_ent': ''})
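# Every snippet in this section leans on NLTK's WhitespaceTokenizer.span_tokenize,
# which yields (start, end) character offsets rather than the token strings
# themselves. A minimal sketch of that behavior (the example string and variable
# names below are illustrative, not taken from any snippet here):
from nltk.tokenize import WhitespaceTokenizer

demo_text = "Hello   brave new world"
demo_spans = list(WhitespaceTokenizer().span_tokenize(demo_text))
# demo_spans == [(0, 5), (8, 13), (14, 17), (18, 23)]
demo_tokens = [demo_text[s:e] for s, e in demo_spans]
# demo_tokens == ['Hello', 'brave', 'new', 'world']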
class LingPipeParser(object):

    def __init__(self, config):
        self.clear()
        self.config = config

    def clear(self):
        self.tok_num = 0
        self.byte_idx = 0
        self.line_idx = 0
        self.word_tokenizer = WhitespaceTokenizer()

    def set(self, ner_dom):
        self.clear()
        ## nltk wants a unicode string, so decode it, and then we will
        ## re-encode it to carefully recover the byte offsets.  We
        ## must take care not to use any nltk components that insert
        ## new whitespace, such as
        ## nltk.tokenize.treebank.TreebankTokenizer
        self.ner_dom = ner_dom
        self.attributes = []
        self.relations = []

    def sentences(self):
        '''
        Iterate over <s> XML-like tags and tokenize with nltk
        '''
        for sentence_id, node in enumerate(self.ner_dom.childNodes):
            ## increment the char index with any text before the <s>
            ## tag.  Crucial assumption here is that the LingPipe XML
            ## tags are inserted into the original byte array without
            ## modifying the portions that are not inside the
            ## LingPipe-added tags themselves.
            if node.nodeType == node.TEXT_NODE:
                ## we expect to only see TEXT_NODE instances with whitespace
                assert only_whitespace.match(node.data), repr(node.data)

                ## must convert back to utf-8 to have expected byte offsets
                self.byte_idx += len(node.data.encode('utf-8'))

                ## count full lines, i.e. only those that end with a \n
                # 'True' here means keep the trailing newlines
                for line in node.data.splitlines(True):
                    if line.endswith('\n'):
                        self.line_idx += 1
            else:
                logger.debug('getting tokens for sentence_id=%d' % sentence_id)

                more_sentence_remains = True
                while more_sentence_remains:
                    ## always a sentence
                    sent = Sentence()

                    ## this "node" came from the for loop above, and its
                    ## childNodes list might have been popped by a
                    ## previous pass through this while loop
                    tokens = iter(self.tokens(node))

                    while 1:
                        try:
                            tok = tokens.next()
                            sent.tokens.append(tok)
                            #logger.debug('got token: %r %d %d' % (tok.token, tok.mention_id, tok.sentence_pos))
                        except StopIteration:
                            yield sent
                            more_sentence_remains = False
                            break

    def _make_token(self, start, end):
        '''
        Instantiates a Token from self._input_string[start:end]
        '''
        ## all thrift strings must be encoded first
        tok_string = self._input_string[start:end].encode('utf-8')
        if only_whitespace.match(tok_string):
            ## drop any tokens with only whitespace
            return None
        tok = Token()
        tok.token = tok_string
        tok.token_num = self.tok_num
        if 'BYTES' in self.config['offset_types']:
            tok.offsets[OffsetType.BYTES] = Offset(
                type=OffsetType.BYTES,
                first=self.byte_idx + len(self._input_string[:start].encode('utf-8')),
                length=len(tok_string),
                value=self.config['offset_debugging'] and tok_string or None,
            )
        if 'LINES' in self.config['offset_types']:
            tok.offsets[OffsetType.LINES] = Offset(
                type=OffsetType.LINES,
                first=self.line_idx,
                length=1,
                value=self.config['offset_debugging'] and tok_string or None,
            )
        self.tok_num += 1
        ## keep track of position within a sentence
        tok.sentence_pos = self.sent_pos
        self.sent_pos += 1
        return tok

    def tokens(self, sentence_dom):
        '''
        Tokenize all the words and preserve NER labels from ENAMEX tags
        '''
        ## keep track of sentence position, which is reset for each
        ## sentence, and used above in _make_token
        self.sent_pos = 0

        ## keep track of mention_id, so we can distinguish adjacent
        ## multi-token mentions within the same coref chain
        mention_id = 0

        while len(sentence_dom.childNodes) > 0:
            ## shrink the sentence_dom's child nodes.  In v0_2_0 this
            ## was required to cope with HitMaxi16.  Now it is just to
            ## save memory.
            node = sentence_dom.childNodes.pop(0)

            if node.nodeType == node.TEXT_NODE:
                ## process portion before an ENAMEX tag
                for line in node.data.splitlines(True):
                    self._input_string = line
                    for start, end in self.word_tokenizer.span_tokenize(line):
                        tok = self._make_token(start, end)
                        if tok:
                            yield tok
                    if line.endswith('\n'):
                        ## maintain the index to the current line
                        self.line_idx += 1
                    ## increment index past the 'before' portion
                    self.byte_idx += len(line.encode('utf-8'))

            else:
                ## process text inside an ENAMEX tag
                assert node.nodeName == 'ENAMEX', node.nodeName
                chain_id = node.attributes.get('ID').value
                entity_type = node.attributes.get('TYPE').value

                for node in node.childNodes:
                    assert node.nodeType == node.TEXT_NODE, node.nodeType
                    for line in node.data.splitlines(True):
                        self._input_string = line
                        for start, end in self.word_tokenizer.span_tokenize(line):
                            tok = self._make_token(start, end)
                            if tok:
                                if entity_type in _PRONOUNS:
                                    tok.mention_type = MentionType.PRO
                                    tok.entity_type = _ENTITY_TYPES[entity_type]

                                    ## create an attribute
                                    attr = Attribute(
                                        attribute_type=AttributeType.PER_GENDER,
                                        value=str(_PRONOUNS[entity_type])
                                    )
                                    self.attributes.append(attr)

                                else:
                                    ## regular entity_type
                                    tok.mention_type = MentionType.NAME
                                    tok.entity_type = _ENTITY_TYPES[entity_type]

                                tok.equiv_id = int(chain_id)
                                tok.mention_id = mention_id
                                yield tok

                        if line.endswith('\n'):
                            ## maintain the index to the current line
                            self.line_idx += 1
                        ## increment index past the 'before' portion
                        self.byte_idx += len(line.encode('utf-8'))

                ## increment mention_id within this sentence
                mention_id += 1
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a
    new chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'

    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []
        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(
                stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode('utf8')
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                    start, end, exc_info=True)
                    sys.exit('failed to cope with %r in %r'
                             % (sent_str[start:end], sent_str))
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels
                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)
                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id
                        tok.mention_id = mention_id
                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
random.seed(42)

# Load the training set
training_texts, training_spans = load_dataset(args.train_dir)

# Use the NLTK tokenizer
tokenizer = WhitespaceTokenizer()

# Get the tokenized posts and their labels
tokenized_posts, gold_labels = [], []

# Tokenize the texts and get the corresponding labels
for text, span in zip(training_texts, training_spans):
    # Tokenize post
    tokens_offsets = tokenizer.span_tokenize(text)
    gold_offset_chars = set(span)

    # Determine label for each token
    tokenized_post, post_labels = [], []
    for i, j in tokens_offsets:
        # check if this token label is toxic
        toxic_label = 0
        for k in range(i, j):
            if k in gold_offset_chars:
                toxic_label = 1
                break
        # remove punctuation from the token
        tokenized_post.append(text[i:j].translate(
            str.maketrans('', '', string.punctuation)))
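# The snippet above is cut off before the per-token labels are collected; a minimal
# sketch of the same labeling idea on a made-up post (the text and gold span below
# are illustrative only):
from nltk.tokenize import WhitespaceTokenizer

demo_text = "you are a fool"
demo_gold_chars = set(range(10, 14))   # character offsets of "fool" marked toxic

demo_labels = []
for i, j in WhitespaceTokenizer().span_tokenize(demo_text):
    # a token counts as toxic if any of its character offsets is in the gold span
    demo_labels.append(1 if any(k in demo_gold_chars for k in range(i, j)) else 0)
# demo_labels == [0, 0, 0, 1]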
class nltk_tokenizer(IncrementalTransform):
    """
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a
    new chunk with Sentence objects generated using NLTK tokenizers
    """

    tagger_id = "nltk_tokenizer"

    def __init__(self, config):
        self.config = config
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()

    def _sentences(self, clean_visible):
        "generate strings identified as sentences"
        previous_end = 0
        clean_visible = clean_visible.decode("utf8")
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        "make a sortedcollection on body.labels"
        labels = stream_item.body.labels.get(self.config.get("annotator_id"))
        if not labels:
            labels = []
        self.label_index = SortedCollection(
            labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        "assemble Sentence and Token objects"
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode("utf8")
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                    start, end, exc_info=True)
                    sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str))
                tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos)
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, first=sent_start + start, length=end - start
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info("overlapping label: %r" % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels
                        logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id)
                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id
                        tok.mention_id = mention_id
                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
#outf.write("---BOS---\n")  #Beginning of sentence

sentence_re = r'''(?x)          # set flag to allow verbose regexps
      (?:[A-Z])(?:\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*              # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                    # ellipsis
    | [][.,;"'?():-_`]          # these are separate tokens
'''

#Remove punctuation
'''table = dict.fromkeys(i for i in range(sys.maxunicode)
                         if unicodedata.category(chr(i)).startswith('P'))
sout=s.translate(table)'''

tokenizer = WhitespaceTokenizer()
tokenwords = WhitespaceTokenizer().tokenize(s)
t_spans = tokenizer.span_tokenize(s)
t_spans_l = []
#tokenwords2=word_tokenize(s)
word_spans_ = []
for w in t_spans:
    t_spans_l.append(w)
#print(t_spans_l)
k = 0
for t in tokenwords:
    #index=tokenwords.index(t)
    #print(t[len(t)-1])
    w = t_spans_l[k]
class ACEParser:

    def __init__(self):
        self.sent_tokenizer = PunktSentenceTokenizer()
        # self.word_tokenizer = RegexpTokenizer('\w+|\S+')
        self.word_tokenizer = WhitespaceTokenizer()
        self.root = None
        self.text = None
        self.sentence_offsets = []
        self.df = pd.DataFrame(
            columns=["doc_id", "sentence", "tokens", "events", "entities"])

    def get_text(self, sgm_file):
        with open(sgm_file, "r", encoding="utf-8") as f:
            text = f.read()

        # Gets rid of lines with only tags
        text = re.sub(r"<(.|\s|\n)*?>", r"", text)

        sentence_offsets = list(self.sent_tokenizer.span_tokenize(text))
        sentences = []
        for offset in sentence_offsets:
            sentence_text = text[offset[0]:offset[1]]
            sentences.append(sentence_text)

        self.sentence_offsets = sentence_offsets
        # keep the cleaned text around so get_sentences() can slice it later
        self.text = text
        return text

    def create_tree(self, apf_file):
        with open(apf_file, "r", encoding="utf-8") as f:
            xml_text = f.read()
        root = etree.fromstring(xml_text)
        self.root = root

    def get_extents(self):
        extent_nodes = self.root.xpath("//extent/charseq")
        return [
            self.get_offset_tuple(extent_node) for extent_node in extent_nodes
        ]

    def get_offset_tuple(self, extent_node):
        return (int(extent_node.get("START")),
                int(extent_node.get("END")) + 1)  # +1 makes them exclusive

    def get_sentences(self):
        sentences = []
        for offset in self.sentence_offsets:
            sentence_text = self.text[offset[0]:offset[1]]
            sentences.append(sentence_text)
        return sentences

    def find_sentence_index(self, offset):
        for i, sent_offset in enumerate(self.sentence_offsets):
            if offset[0] >= sent_offset[0] and offset[1] <= sent_offset[1]:
                return i

    def offset_to_token(self, start, end, token_offsets, normalize=0):
        # normalize shifts start and end so they are comparable with token_offsets
        start -= normalize
        end -= normalize
        # TODO: change this to if end == offset[1]. In the case that end < offset[1]
        # use startswith and extend token_offsets list
        for i, offset in enumerate(token_offsets):
            if end <= offset[1]:
                for j in range(i, -1, -1):
                    if start >= token_offsets[j][0]:
                        return j, i + 1  # Make it exclusive
        raise Exception(
            "Error while converting offset to token indexes. "
            "Start offset : %d , End offset : %d Norm : %d, Token offsets : %s"
            % (start, end, normalize, str(token_offsets)))

    def create_json_output(self, doc_text, filename):
        # doc_id = self.root.xpath("document")[0].get("DOCID")
        doc_id = filename
        event_nodes = self.root.xpath("//event")
        # TODO: We lose coreference information doing it this way. For now it is ok,
        # but need to accommodate the other way too !!!
        event_mentions = []
        for event_node in event_nodes:
            event_type = event_node.get("TYPE")
            event_subtype = event_node.get("SUBTYPE")
            event_id = event_node.get("ID")
            event_mention_nodes = event_node.xpath("event_mention")
            for mention_node in event_mention_nodes:
                # You actually don't need these two for finding which sentence we are talking about.
                # Because we already made sure that all of our extents are covered by sentence offsets.
                # extent_node = mention.xpath("/extent/charseq")[0]
                # extent = get_offset_tuple(extent_node)
                trigger_offset = self.get_offset_tuple(
                    mention_node.xpath("anchor/charseq")[0])

                # find which sentence this belongs to. Only need to do this once.
                sent_idx = self.find_sentence_index(trigger_offset)

                event_arguments = []
                arguments = mention_node.xpath("event_mention_argument")
                for argument in arguments:
                    arg_role = argument.get("ROLE")
                    arg_offset = self.get_offset_tuple(
                        argument.xpath("extent/charseq")[0])
                    # TODO: NEED TO ADD ENTITY TYPES, getting them from refids !!!
                    event_arguments.append({
                        "role": arg_role,
                        "start": arg_offset[0],
                        "end": arg_offset[1]
                    })

                event_mentions.append({
                    "event_id": event_id,
                    "event_type": event_type,
                    "event_subtype": event_subtype,
                    "trigger": {
                        "start": trigger_offset[0],
                        "end": trigger_offset[1]
                    },
                    "arguments": event_arguments,
                    "sent_idx": sent_idx
                })

        # For printing later
        # old_event_mentions = copy.deepcopy(event_mentions)
        tokens_list_for_printing = []

        for i, sentence_offset in enumerate(self.sentence_offsets):
            sentence_text = doc_text[sentence_offset[0]:sentence_offset[1]]
            token_offsets = list(
                self.word_tokenizer.span_tokenize(sentence_text))
            tokens = [
                sentence_text[offset[0]:offset[1]] for offset in token_offsets
            ]
            tokens_list_for_printing.append(tokens)
            entity_mentions = []
            curr_event_mentions = []
            for j in range(len(event_mentions)):
                mention = event_mentions[j]
                if mention["sent_idx"] == i:
                    # ipdb.set_trace()
                    start_idx, end_idx = self.offset_to_token(
                        mention["trigger"]["start"],
                        mention["trigger"]["end"],
                        token_offsets,
                        normalize=sentence_offset[0])
                    event_mentions[j]["trigger"]["start"] = start_idx
                    event_mentions[j]["trigger"]["end"] = end_idx
                    for k, argument in enumerate(mention["arguments"]):
                        start_idx, end_idx = self.offset_to_token(
                            argument["start"],
                            argument["end"],
                            token_offsets,
                            normalize=sentence_offset[0])
                        event_mentions[j]["arguments"][k]["start"] = start_idx
                        event_mentions[j]["arguments"][k]["end"] = end_idx
                    curr_event_mentions.append(event_mentions[j])

            self.df = self.df.append(
                {
                    "doc_id": doc_id,
                    "sentence": sentence_text,
                    "tokens": tokens,
                    "events": curr_event_mentions,
                    "entities": entity_mentions
                },
                ignore_index=True)

        # Printing stuff
        # for mention, old_mention in zip(event_mentions, old_event_mentions):
        #     tokens = tokens_list_for_printing[mention["sent_idx"]]
        #     print("Offset version trigger : %s , Tokens version trigger : %s" %(doc_text[old_mention["trigger"]["start"]:old_mention["trigger"]["end"]], tokens[mention["trigger"]["start"]:mention["trigger"]["end"]]))
        #     for argument, old_argument in zip(mention["arguments"], old_mention["arguments"]):
        #         print("Offset version argument : %s , Tokens version argument : %s" %(doc_text[old_argument["start"]:old_argument["end"]], tokens[argument["start"]:argument["end"]]))
        #     print("===========")
        # TODO: Remove debug stuff

    def fix_offsets(self, extents):
        offsets = self.sentence_offsets
        assert len(offsets) > 1
        # print(offsets)
        # print("*************")
        after_count = 0
        before_count = 0
        for extent in extents:
            # Check stuff for printing
            if len([
                    offset for offset in offsets
                    if extent[0] >= offset[0] and extent[1] <= offset[1]
            ]) == 0:
                before_count += 1

            if extent[1] <= offsets[0][1]:
                continue

            for idx in range(1, len(offsets)):
                offset = offsets[idx]
                if extent[1] <= offset[1]:
                    # Ends before this sentence.
                    if extent[0] < offset[0]:
                        # Starts before this sentence
                        # Fixing
                        # print("-------")
                        # print(extent)
                        # print(offsets)
                        for j in range(idx - 1, -1, -1):
                            # For all sentences' offsets before this offset
                            del offsets[j + 1]
                            if extent[0] >= offsets[j][0]:
                                offsets[j] = (offsets[j][0], offset[1])
                                break
                        # print(offsets)
                        break
                    else:
                        # Nothing wrong with this extent
                        break

            # Check stuff for printing
            if len([
                    offset for offset in offsets
                    if extent[0] >= offset[0] and extent[1] <= offset[1]
            ]) == 0:
                ipdb.set_trace()
                # MISSES some due to spaces between sentences
                # print(extent)
                # print(text[extent[0]:extent[1]])
                after_count += 1

        # print("Before : %d -> After : %d" %(before_count, after_count))
        # print("================================================================================================================")
        self.sentence_offsets = offsets
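# A quick illustrative check of ACEParser.offset_to_token; the token offsets and
# character span below are made up (in the parser they come from
# WhitespaceTokenizer.span_tokenize and the ACE charseq elements), and instantiating
# ACEParser assumes the module's imports (pandas, nltk) are available:
demo_parser = ACEParser()
demo_token_offsets = [(0, 3), (4, 9), (10, 15)]   # e.g. spans of "The quick brown"
demo_start, demo_end = demo_parser.offset_to_token(4, 15, demo_token_offsets)
# (1, 3): the span covers tokens 1 and 2, with an exclusive end index
assert (demo_start, demo_end) == (1, 3)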
import pymysql
from nltk.tokenize import WhitespaceTokenizer

connection = pymysql.connect(host="127.0.0.1",
                             user="******",
                             password="******",
                             charset='utf8',
                             db='tf-idf',
                             cursorclass=pymysql.cursors.DictCursor)
cursor = connection.cursor()

terms = ['debut', 'two', 'language', 'also']
tokenizer = WhitespaceTokenizer()

sql = 'SELECT * FROM wiki'
cursor.execute(sql)

for record in cursor.fetchall():
    doc_id = record['id']
    text = record['text']
    for term in terms:
        for start, end in tokenizer.span_tokenize(text):
            if text[start:end].lower() == term:
                insert_sql = 'INSERT INTO inverted_index VALUES (%s, %s)'
                cursor.execute(insert_sql, (term, doc_id))
                break

connection.commit()
connection.close()
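# A hedged follow-up lookup against the table built above, run before
# connection.close(); it assumes inverted_index was created with (term, doc_id)
# columns in that order. The snippet above inserts positionally, so the column
# names used here are only illustrative:
cursor.execute('SELECT doc_id FROM inverted_index WHERE term = %s', ('debut',))
doc_ids = [row['doc_id'] for row in cursor.fetchall()]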
class InvertedIndex:
    ''' Main Inverted-Index structure'''

    def __init__(self):
        self._tokenizer = WhitespaceTokenizer()
        self._index_cache = IndexCache()
        self._stop_words = set(stopwords.words('english'))
        self._stemmer = SnowballStemmer("english")
        self._max_documents_per_shard = 50000
        self._num_documents_in_current_shard = 0
        if os.path.isfile("index_data/index.meta"):
            self._num_documents_in_current_shard = pickle.load(
                open("index_data/index.meta"))

    def search(self, query):
        combined_results = None
        ret_results = None
        for i in range(0, len(query), 2):
            op = query[i]
            keyword = self._stemmer.stem(query[i + 1].strip(string.punctuation))
            keyword_results = self._search_keyword(keyword)
            if combined_results:
                if op == "AND":
                    combined_results = combined_results.intersection(
                        set(keyword_results.keys()))
                elif op == "OR":
                    combined_results = combined_results.union(
                        set(keyword_results.keys()))
                else:
                    return {"status": False, "message": "Malformed query"}
                for doc in ret_results.keys():
                    if doc not in combined_results:
                        del ret_results[doc]
                    elif keyword_results.get(doc):
                        # set.union returns a new set, so re-assign the merged positions
                        ret_results[doc] = ret_results[doc].union(keyword_results[doc])
                for doc in keyword_results:
                    if doc not in ret_results:
                        ret_results[doc] = keyword_results[doc]
            else:
                combined_results = set(keyword_results.keys())
                ret_results = keyword_results

        result_counts = dict()
        for el in ret_results:
            result_counts[el] = len(ret_results[el])
        sorted_result_counts = sorted(result_counts.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
        sorted_results = []
        for key, _ in sorted_result_counts:
            sorted_results.append({"key": key, "positions": ret_results[key]})

        if len(sorted_results) > 0:
            ret = {"status": True, "results": sorted_results}
        else:
            ret = {"status": False, "message": "No hits"}
        return ret

    def _search_keyword(self, query):
        docs = self._index_cache.get(query)
        if not docs:
            return dict()
        return docs

    def add(self, key, text):
        self._num_documents_in_current_shard += 1
        if self._num_documents_in_current_shard > self._max_documents_per_shard:
            self._num_documents_in_current_shard = 0
            self._index_cache.create_new_shard()

        token_positions = self._tokenizer.span_tokenize(text)
        for pos in token_positions:
            start_pos = pos[0]
            end_pos = pos[1]
            token = text[start_pos:end_pos].lower()
            if token in self._stop_words:
                continue
            token = token.strip(string.punctuation)
            token = self._stemmer.stem(token)
            if len(token) > 0:
                self._index_cache.add(token, key, (start_pos, end_pos))

    def delete(self, key, text):
        pass

    def save(self):
        pickle.dump(self._num_documents_in_current_shard,
                    open("index_data/index.meta", "wb"))
        self._index_cache.flush()
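# A minimal usage sketch of InvertedIndex; it assumes the IndexCache dependency and
# the index_data/ directory exist, and uses the flat [op, keyword, ...] query list
# that search() expects:
index = InvertedIndex()
index.add("doc1", "Quick brown foxes are quick.")
index.add("doc2", "The lazy dog sleeps.")

# the operator of the first pair is effectively ignored, since there is nothing
# to combine with yet
result = index.search(["AND", "quick", "AND", "foxes"])
# expected shape: {"status": True, "results": [{"key": "doc1", "positions": ...}]}

index.save()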
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a
    new chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'

    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []
        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                token_str = sent_str[start:end].encode('utf8')
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels
                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)
                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id
                        tok.mention_id = mention_id
                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences

    def process_item(self, stream_item, context=None):
        if not hasattr(stream_item.body, 'clean_visible') or not stream_item.body.clean_visible:
            return stream_item
        self.label_index = None
        self.label_to_mention_id = dict()
        stream_item.body.sentences[self.tagger_id] = self.make_sentences(stream_item)
        return stream_item

    def __call__(self, stream_item, context=None):
        ## support the legacy callable API
        return self.process_item(stream_item, context)