def lemma_query(self, query, sent_level=False):
    spacy_string = TextProcessing().nlp[self.lang](query)
    lemma_list = [token.lemma_ for token in spacy_string]
    lemma_string = ' '.join(lemma_list)
    with self.ix_lemma.searcher() as searcher:
        # Japanese is not whitespace-tokenized, so skip the exact-phrase quoting
        if self.lang == "ja":
            parsed = QueryParser("content", self.ix_lemma.schema).parse(lemma_string)
        else:
            parsed = QueryParser("content", self.ix_lemma.schema).parse('"%s"' % lemma_string)
        results = searcher.search(parsed, limit=None)
        raw_res = [self.utterances[int(res['path'])] for res in results]
        if not sent_level:
            return raw_res
        # finding the sentence that contains the query
        new_res = []
        for utt in raw_res:
            for sent_num, sent in enumerate(utt.spacy.sents):
                # creating the lemma version of the sentence
                lemma_text = ' '.join(x.lemma_ for x in sent)
                if lemma_string in lemma_text:
                    new_res.append(
                        Utterance(sent.text, utt.id, sent_num, lang=self.lang))
        return new_res
def query(self, query, sent_level=False):
    with self.ix.searcher() as searcher:
        # Japanese is not whitespace-tokenized, so skip the exact-phrase quoting
        if self.lang == "ja":
            query_p = QueryParser("content", self.ix.schema).parse(query)
        else:
            query_p = QueryParser("content", self.ix.schema).parse('"%s"' % query)
        results = searcher.search(query_p, limit=None)
        raw_res = [self.utterances[int(res['path'])] for res in results]
        if not sent_level:
            return raw_res
        # finding the sentence that contains the query
        new_res = []
        for utt in raw_res:
            for sent_num, sent in enumerate(utt.spacy.sents):
                if query in sent.text:
                    new_res.append(
                        Utterance(sent.text, utt.id, sent_num, lang=self.lang))
        return new_res
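# A minimal usage sketch (hypothetical variable names; assumes a Corpus
# instance `corpus` with its Whoosh indices already built):
#
#     # exact phrase match over whole utterances
#     hits = corpus.query("book a flight")
#
#     # same phrase matched against lemmas, narrowed to the sentence level
#     lemma_hits = corpus.lemma_query("booked a flight", sent_level=True)
#     for utt in lemma_hits:
#         print(utt.id, utt.text)
#
# query() matches the surface form, while lemma_query() lemmatizes the query
# first, so inflected variants of the same phrase are retrieved as well.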
def reconstruct_frame(corpus, schema):
    frame = Frame(schema['frame_name'])
    pos_set = set()
    neg_set = set()
    if isinstance(schema['positive_set'], list):
        # the positive/negative examples are stored inline in the schema
        for text in schema['positive_set']:
            pos_set.add(Utterance(text, None))
        for text in schema['negative_set']:
            neg_set.add(Utterance(text, None))
    else:
        # otherwise the schema points to an XML file of labelled sentences
        tree = ET.parse(schema['positive_set'])
        root = tree.getroot()
        for sent in root:
            if sent.tag == "positive":
                pos_set.add(Utterance(sent.text, None))
            elif sent.tag == "negative":
                neg_set.add(Utterance(sent.text, None))
    frame.addExamples(pos_set)
    frame.trainModel(corpus,
                     scale_to=schema['scale_to'],
                     epochs=schema['epochs'],
                     batch_size=schema['batch_size'],
                     reg_param=schema['reg_param'],
                     neg_set=neg_set)
    return frame
def analyze(self, sent):
    sent = Utterance(sent, -1)
    labels = self.parse(sent)
    return {
        'text': sent.text,
        'frames': [label.get_state() for label in labels],
        'dep': displacy.render(sent.spacy, style='dep', options={'offset_x': 5})
    }
def train_lambda_attribute(corpus, file):
    dummy = Utterance('silly legacy code that needs at least this many examples to run', None)
    with open(file, 'r') as infile:
        schema = json.load(infile)
    attribute = FrameAttribute(schema['name'],
                               schema['linguistic_info'],
                               schema['unique'])
    namespace = {}
    model_text = schema['func']
    name = schema['func_name']
    print('Training ', schema['name'])
    attribute.addExamples(dummy.spacy)
    attribute.trainModel(corpus, type_="lambda_rules", func=model_text, func_name=name)
    return attribute
def train_ml_attribute(corpus, file):
    with open(file, 'r') as infile:
        schema = json.load(infile)
    attribute = FrameAttribute(schema['name'],
                               schema['linguistic_info'],
                               schema['unique'])
    # each example is stored as [sentence_text, token_index]
    reconstructed_examples = set()
    for e in schema['examples']:
        doc = Utterance(e[0], None).spacy
        reconstructed_examples.add(doc[e[1]])
    attribute.addExamples(reconstructed_examples)
    print('Training ', schema['name'])
    attribute.trainModel(corpus, "nocontext")
    return attribute
def find_nearest_n(self, query_str, n, subset=None):
    query = Utterance(query_str, 999999, lang=self.lang).spacy.vector
    utt_set = list(subset) if subset else self.utterances
    distances = np.zeros(len(utt_set))
    for i, utterance in enumerate(utt_set):
        tmp = cosine(query, utterance.spacy.vector)
        # push invalid distances (e.g. zero vectors) to the end of the ranking
        if math.isnan(tmp) or tmp > 1:
            distances[i] = 100
        else:
            distances[i] = tmp
    top_indexes = distances.argsort()
    nearest_utts = [utt_set[j] for j in top_indexes][:n]
    nearest_dists = distances[top_indexes][:n]
    return nearest_utts, nearest_dists
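# A minimal usage sketch (hypothetical names; assumes `corpus` is a Corpus
# whose utterances carry spaCy vectors):
#
#     utts, dists = corpus.find_nearest_n("I want to reserve a table", n=5)
#     for utt, dist in zip(utts, dists):
#         print(round(dist, 3), utt.text)
#
# Unlike query(), this is a purely vector-based lookup: it ranks utterances
# by cosine distance to the query embedding, so it can surface paraphrases
# that share no tokens with the query string.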
def load_frame_pos_set(filename):
    with open(filename, 'r') as infile:
        loaded = json.load(infile)['positive_set']
    print(loaded)
    positive_list = list()
    if isinstance(loaded, list):
        positive_list = loaded
    else:
        tree = ET.parse(loaded)
        root = tree.getroot()
        for sent in root:
            if sent.tag == "positive":
                positive_list.append(sent.text)
    print('There are ' + str(len(positive_list)) + ' relevant messages in the corpus')
    positive_utterances = set()
    for item in positive_list:
        positive_utterances.add(Utterance(item, None))
    return positive_utterances
def parse(self, sent):
    if isinstance(sent, str):
        sent = Utterance(sent, -1)
    elif not isinstance(sent, Utterance):
        print("Argument to parse must be a string or an Utterance object")
        return None
    labels = []
    # Parse for each frame
    for frame in self.frames:
        pred = frame.model.predict([sent])
        if pred[0][1] > 0.5:
            flabel = FrameLabel(frame, sent, pred[0][1])
            for attr in frame.attributes:
                self._parse_attr(attr, sent, flabel)
            labels.append(flabel)
    # logging.debug('Found labels {0}'.format(labels))
    return labels
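# A minimal usage sketch (hypothetical names; assumes `parser` holds trained
# frames in self.frames):
#
#     labels = parser.parse("I'd like to book a flight to Boston")
#     for flabel in labels:
#         print(flabel.get_state())
#
# parse() runs every frame model over the sentence and keeps the frames whose
# positive-class probability exceeds 0.5, filling in each frame's attributes
# via _parse_attr(); analyze() above wraps the same call and adds a displaCy
# dependency rendering for display.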
def swap_attributes(attr_examples, other_attr_examples):
    # building a doc-to-attribute dictionary for each attribute
    doc2attrA = {}
    for attr in attr_examples:
        assert isinstance(attr, Token)
        doc2attrA[attr.doc] = attr
    doc2attrB = {}
    for attr in other_attr_examples:
        assert isinstance(attr, Token)
        doc2attrB[attr.doc] = attr
    # keeping only the docs that contain both attributes
    docs_with_both_attr = \
        set(doc2attrA.keys()).intersection(set(doc2attrB.keys()))
    # swapping the attributes in each doc and building new utterances
    new_utterances = []
    new_attr_examples = []
    new_other_attr_examples = []
    attrA_index = None
    attrB_index = None
    for doc in docs_with_both_attr:
        new_sequence = []
        for i, token in enumerate(doc):
            if token == doc2attrA[doc]:
                attrA_index = i
                new_sequence.append(doc2attrB[doc].text)
            elif token == doc2attrB[doc]:
                attrB_index = i
                new_sequence.append(doc2attrA[doc].text)
            else:
                new_sequence.append(token.text)
        new_string = ' '.join(new_sequence)
        new_utt = Utterance(new_string, None)
        new_utterances.append(new_utt)
        new_attr_examples.append(new_utt.spacy[attrA_index])
        new_other_attr_examples.append(new_utt.spacy[attrB_index])
    return new_utterances, new_attr_examples, new_other_attr_examples
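# A minimal usage sketch (hypothetical token sets): swap_attributes acts as a
# data-augmentation helper. Given the example tokens of two attributes, it
# finds the docs that contain one token of each, exchanges the two tokens'
# surface forms, and re-parses the result:
#
#     new_utts, new_a, new_b = swap_attributes(source_city_tokens,
#                                              destination_city_tokens)
#
# Each returned utterance is a copy of an original sentence with the two
# attribute tokens swapped, and new_a / new_b point at the swapped tokens
# inside the new spaCy docs.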
def __init__(self, input_, limit=None, build_index=False, csv_path='', lang='en'):
    # checking the type of the input
    print("init Corpus")
    self.lang = lang
    if isinstance(input_, pd.DataFrame):
        if limit is None:
            self.data = input_
        else:
            self.data = input_.head(limit)
        path = csv_path
    elif isinstance(input_, str):
        if limit is None:
            self.data = pd.read_csv(input_)
        else:
            self.data = pd.read_csv(input_, nrows=limit)
        if csv_path:
            path = csv_path
        else:
            path = input_
    else:
        raise ValueError("The input to the corpus should be either a "
                         "file name or a DataFrame.")

    # Step 1) loading the sentences
    all_text = self.data['text'].tolist()

    # Step 2) loading the Semafor (FrameNet) results
    print("Parsing the Semafor data... ")
    semafor_file = os.path.dirname(path) + "/semaforData.json"
    framenet = None
    if os.path.isfile(semafor_file):
        with open(semafor_file, "rb") as f:
            framenet = f.readlines()
    else:
        warnings.warn('No FrameNet data found for the corpus.')

    # Step 3) loading the DeepSRL results
    print("Parsing the DeepSRL data... ")
    deepsrl_file = os.path.dirname(path) + "/deepsrlData.json"
    deepsrl = None
    if os.path.isfile(deepsrl_file):
        with open(deepsrl_file, "rb") as f:
            deepsrl = f.readlines()
    else:
        warnings.warn('No PropBank data found for the corpus.')

    # creating utterances from the loaded data
    self.utterances = []
    print("Creating Utterances...")
    time.sleep(0.3)  # to avoid prints in the middle of the progress bar
    for index, sent in enumerate(log_progress(all_text)):
        self.utterances.append(Utterance(sent, _id=index, lang=self.lang))
        utterance = self.utterances[index]
        utterance.frames = getFrames(framenet, index)
        utterance.propbank = getPropBank(deepsrl, index)
    time.sleep(0.3)  # to avoid prints in the middle of the progress bar

    # building or loading the indices
    self.prepare_indices(build_index, path)
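# A minimal construction sketch (hypothetical path; assumes the CSV has a
# 'text' column, with optional semaforData.json / deepsrlData.json files
# sitting next to it):
#
#     corpus = Corpus('data/restaurant/corpus.csv', limit=1000,
#                     build_index=True, lang='en')
#
# Passing a DataFrame instead of a path also works, in which case csv_path
# tells the corpus where to look for the Semafor/DeepSRL files and where to
# store or load its indices.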