Example #1
 def lemma_query(self, query, sent_level=False):
     # lemmatize the query with the language-specific spaCy pipeline
     spacy_string = TextProcessing().nlp[self.lang](query)
     lemma_list = [token.lemma_ for token in spacy_string]
     lemma_string = ' '.join(lemma_list)
     with self.ix_lemma.searcher() as searcher:
         # parse the lemmas as a phrase query; Japanese is searched
         # without the phrase quoting
         if self.lang == "ja":
             parsed = QueryParser("content",
                                  self.ix_lemma.schema).parse(lemma_string)
         else:
             parsed = QueryParser("content",
                                  self.ix_lemma.schema).parse('"%s"' % lemma_string)
         results = searcher.search(parsed, limit=None)
         raw_res = [self.utterances[int(res['path'])] for res in results]
     if not sent_level:
         return raw_res
     else:
         # finding the sentence that contains the query
         new_res = []
         for utt in raw_res:
             for sent_num, sent in enumerate(utt.spacy.sents):
                 # creating the lemma version of sentence
                 lemma_text = ' '.join([x.lemma_ for x in sent])
                 if lemma_string in lemma_text:
                     new_res.append(
                         Utterance(sent.text,
                                   utt.id,
                                   sent_num,
                                   lang=self.lang))
         return new_res
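
For reference, a minimal self-contained sketch of the lemmatization step above (assuming the en_core_web_sm model is installed):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('The cats were running')
print(' '.join(token.lemma_ for token in doc))  # the cat be run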
Example #2
 def query(self, query, sent_level=False):
     with self.ix.searcher() as searcher:
         # as in lemma_query: phrase query, except for Japanese
         if self.lang == "ja":
             query_p = QueryParser("content",
                                   self.ix.schema).parse(query)
         else:
             query_p = QueryParser("content",
                                   self.ix.schema).parse('"%s"' % query)
         results = searcher.search(query_p, limit=None)
         raw_res = [self.utterances[int(res['path'])] for res in results]
     if not sent_level:
         return raw_res
     else:
         # finding the sentence that contains the query
         new_res = []
         for utt in raw_res:
             for sent_num, sent in enumerate(utt.spacy.sents):
                 if query in sent.text:
                     new_res.append(
                         Utterance(sent.text,
                                   utt.id,
                                   sent_num,
                                   lang=self.lang))
         return new_res
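
The quoting in the two branches matters because Whoosh's QueryParser turns a quoted string into a single phrase query and an unquoted one into separate term queries. A small self-contained sketch with a hypothetical one-field schema:

from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser

schema = Schema(content=TEXT)           # hypothetical single-field schema
parser = QueryParser("content", schema)
print(parser.parse('big cat'))          # a conjunction of two term queries
print(parser.parse('"big cat"'))        # a single phrase query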
Example #3
def reconstruct_frame(corpus, schema):
    frame = Frame(schema['frame_name'])
    pos_set = set()
    neg_set = set()
    if isinstance(schema['positive_set'], list):
        # the example sentences are stored inline in the schema
        pos_set = {Utterance(text, None) for text in schema['positive_set']}
        neg_set = {Utterance(text, None) for text in schema['negative_set']}
    else:
        # the schema points to an XML file of <positive>/<negative> sentences
        tree = ET.parse(schema['positive_set'])
        root = tree.getroot()
        for sent in root:
            if sent.tag == "positive":
                pos_set.add(Utterance(sent.text, None))
            elif sent.tag == "negative":
                neg_set.add(Utterance(sent.text, None))
    frame.addExamples(pos_set)
    frame.trainModel(corpus, scale_to=schema['scale_to'], epochs=schema['epochs'],
                     batch_size=schema['batch_size'], reg_param=schema['reg_param'],
                     neg_set=neg_set)
    return frame
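
A self-contained sketch of the XML layout the else-branch expects (made-up sentences):

import xml.etree.ElementTree as ET

xml = ('<sentences>'
       '<positive>I want to book a flight</positive>'
       '<negative>hello there</negative>'
       '</sentences>')
for sent in ET.fromstring(xml):
    print(sent.tag, '->', sent.text)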
Example #4
 def analyze(self, sent):
     sent = Utterance(sent, -1)
     labels = self.parse(sent)
     return {
         'text': sent.text,
         'frames': [label.get_state() for label in labels],
         'dep': displacy.render(sent.spacy, style='dep',
                                options={'offset_x': 5})
     }
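
A self-contained sketch of the displacy call above (assuming en_core_web_sm is installed); with jupyter=False, render returns the SVG markup as a string:

import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('She booked a flight to Boston')
svg = displacy.render(doc, style='dep', options={'offset_x': 5}, jupyter=False)
print(svg[:60])  # start of the SVG markup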
Example #5
def train_lambda_attribute(corpus, file):
    # addExamples requires a non-empty example set, so feed it a
    # throwaway utterance (legacy workaround)
    dummy = Utterance('silly legacy code that needs at least this many examples to run', None)
    with open(file, 'r') as infile:
        schema = json.load(infile)
    attribute = FrameAttribute(schema['name'], schema['linguistic_info'], schema['unique'])
    print('Training', schema['name'])
    attribute.addExamples(dummy.spacy)
    attribute.trainModel(corpus, type_="lambda_rules",
                         func=schema['func'], func_name=schema['func_name'])
    return attribute
Example #6
def train_ml_attribute(corpus, file):
    with open(file, 'r') as infile:
        schema = json.load(infile)
    attribute = FrameAttribute(schema['name'], schema['linguistic_info'], schema['unique'])
    # each stored example is a (sentence_text, token_index) pair
    reconstructed_examples = set()
    for text, token_index in schema['examples']:
        doc = Utterance(text, None).spacy
        reconstructed_examples.add(doc[token_index])
    attribute.addExamples(reconstructed_examples)
    print('Training', schema['name'])
    attribute.trainModel(corpus, "nocontext")
    return attribute
Example #7
 def find_nearest_n(self, query_str, n, subset=None):
     # 999999 is a dummy utterance id; only the vector is used here
     query = Utterance(query_str, 999999, lang=self.lang).spacy.vector
     utt_set = list(subset) if subset else self.utterances
     distances = np.zeros(len(utt_set))
     for i, utterance in enumerate(utt_set):
         tmp = cosine(query, utterance.spacy.vector)
         # push NaN (zero-vector) and out-of-range distances to the end
         if math.isnan(tmp) or tmp > 1:
             distances[i] = 100
         else:
             distances[i] = tmp
     top_indexes = distances.argsort()[:n]
     nearest_utts = [utt_set[j] for j in top_indexes]
     nearest_dists = distances[top_indexes]
     return nearest_utts, nearest_dists
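
The ranking logic in isolation, as a self-contained sketch with toy vectors: collect cosine distances into an array, then argsort for the nearest indices:

import numpy as np
from scipy.spatial.distance import cosine

query = np.array([1.0, 0.0])
vectors = [np.array([0.9, 0.1]), np.array([0.0, 1.0]), np.array([1.0, 0.0])]
distances = np.array([cosine(query, v) for v in vectors])
print(distances.argsort()[:2])  # the two nearest indices: [2 0]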
Example #8
def load_frame_pos_set(filename):
    with open(filename, 'r') as infile:
        loaded = json.load(infile)['positive_set']
    positive_list = []
    if isinstance(loaded, list):
        # the positive set is stored inline
        positive_list = loaded
    else:
        # the positive set is a path to an XML file of tagged sentences
        tree = ET.parse(loaded)
        root = tree.getroot()
        for sent in root:
            if sent.tag == "positive":
                positive_list.append(sent.text)
    print('There are %d relevant messages in the corpus' % len(positive_list))
    positive_utterances = set()
    for item in positive_list:
        positive_utterances.add(Utterance(item, None))
    return positive_utterances
Example #9
    def parse(self, sent):
        if isinstance(sent, str):
            sent = Utterance(sent, -1)
        elif not isinstance(sent, Utterance):
            print("Argument to parse must be a string or Utterance object")
            return None
        labels = []
        # run every registered frame model over the sentence
        for frame in self.frames:
            pred = frame.model.predict([sent])
            # pred[0][1] is the positive-class probability
            if pred[0][1] > 0.5:
                flabel = FrameLabel(frame, sent, pred[0][1])
                for attr in frame.attributes:
                    self._parse_attr(attr, sent, flabel)
                labels.append(flabel)
        return labels
Example #10
def swap_attributes(attr_examples, other_attr_examples):
    # building a doc to attribute dictionary
    doc2attrA = {}
    for attr in attr_examples:
        assert isinstance(attr, Token)
        doc2attrA[attr.doc] = attr
    doc2attrB = {}
    for attr in other_attr_examples:
        assert isinstance(attr, Token)
        doc2attrB[attr.doc] = attr
    # checking the keys that match
    docs_with_both_attr = \
        set(doc2attrA.keys()).intersection(set(doc2attrB.keys()))
    # swapping the attributes in each doc and building new utterances;
    # this assumes the swapped string re-tokenizes to the same positions
    new_utterances = []
    new_attr_examples = []
    new_other_attr_examples = []
    for doc in docs_with_both_attr:
        new_sequence = []
        attrA_index = None
        attrB_index = None
        for i, token in enumerate(doc):
            if token == doc2attrA[doc]:
                attrA_index = i
                new_sequence.append(doc2attrB[doc].text)
            elif token == doc2attrB[doc]:
                attrB_index = i
                new_sequence.append(doc2attrA[doc].text)
            else:
                new_sequence.append(token.text)
        new_string = ' '.join(new_sequence)
        new_utt = Utterance(new_string, None)
        new_utterances.append(new_utt)
        new_attr_examples.append(new_utt.spacy[attrA_index])
        new_other_attr_examples.append(new_utt.spacy[attrB_index])
    return new_utterances, new_attr_examples, new_other_attr_examples
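
The index bookkeeping in isolation, on plain strings (no spaCy); the real function additionally assumes that the joined string re-tokenizes to the same positions:

tokens = ['alice', 'pays', 'bob']
a_idx, b_idx = 0, 2                # positions of the two attribute tokens
swapped = list(tokens)
swapped[a_idx], swapped[b_idx] = swapped[b_idx], swapped[a_idx]
print(' '.join(swapped))           # bob pays alice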
Example #11
    def __init__(self,
                 input_,
                 limit=None,
                 build_index=False,
                 csv_path='',
                 lang='en'):
        # checking the type of input
        print("init Corpus")
        self.lang = lang
        if isinstance(input_, pd.DataFrame):
            if limit is None:
                self.data = input_
            else:
                self.data = input_.head(limit)
            path = csv_path
        elif isinstance(input_, str):
            if limit is None:
                self.data = pd.read_csv(input_)
            else:
                self.data = pd.read_csv(input_, nrows=limit)
            if csv_path:
                path = csv_path
            else:
                path = input_
        else:
            raise ValueError("The input to the corpus should be either a "
                             "file name or a DataFrame.")
        # Step 1) loading the sentences
        all_text = self.data['text'].tolist()

        # Step 2) Load semafor results
        print("Parsing the Semafor data... ")
        semafor_file = os.path.join(os.path.dirname(path), "semaforData.json")
        framenet = None
        if os.path.isfile(semafor_file):
            with open(semafor_file, "rb") as f:
                framenet = f.readlines()
        else:
            warnings.warn('No FrameNet data found for the corpus.')

        # Step 3) Load deepsrl results
        print("Parsing the DeepSRL data... ")
        deepsrl_file = os.path.join(os.path.dirname(path), "deepsrlData.json")
        deepsrl = None
        if os.path.isfile(deepsrl_file):
            with open(deepsrl_file, "rb") as f:
                deepsrl = f.readlines()
        else:
            warnings.warn('No PropBank data found for the corpus.')

        # Creating utterances from the loaded data
        self.utterances = []
        print("Creating Utterances...")
        time.sleep(0.3)  # to avoid prints in the middle of the progress bar
        for index, sent in enumerate(log_progress(all_text)):
            utterance = Utterance(sent, _id=index, lang=self.lang)
            utterance.frames = getFrames(framenet, index)
            utterance.propbank = getPropBank(deepsrl, index)
            self.utterances.append(utterance)
        time.sleep(0.3)  # to avoid prints in the middle of the progress bar

        # Building or loading indices
        self.prepare_indices(build_index, path)
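
Hypothetical usage, assuming the class above is importable as Corpus and that data/messages.csv (a made-up path) has a 'text' column; semaforData.json and deepsrlData.json are optional files next to the CSV:

corpus = Corpus('data/messages.csv', limit=1000, build_index=True, lang='en')
print(len(corpus.utterances))  # number of loaded utterances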