from typing import List, Tuple


def get_property_types(query: str, property_key: str, label_key: str) -> List[Tuple]:
    """Return a list of (uri, label, num_references, lemmas, poss) tuples,
    one per property returned by the query."""
    # paged_query, nlp, log, and get_number_of_property_references are
    # defined elsewhere in the module.
    properties = []
    results = paged_query(query)
    num_results = len(results)
    for index, result in enumerate(results):
        uri = result[property_key]['value']
        label = result[label_key]['value']
        doc = nlp(label)
        lemmas = [token.lemma_ for token in doc]
        poss = [token.pos_ for token in doc]
        log.debug(f'Analyzing property {index} of {num_results}, {label} - {uri}')
        num_references = get_number_of_property_references(uri)
        properties.append((uri, label, num_references, lemmas, poss))
    return properties
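# Hedged standalone sketch of the per-label spaCy analysis used above: parse
# the property label, then collect lemmas and coarse POS tags. The model name
# 'en_core_web_sm' and the sample label are illustrative assumptions.
import spacy

nlp_demo = spacy.load('en_core_web_sm')
doc = nlp_demo('country of citizenship')
print([t.lemma_ for t in doc])  # e.g. ['country', 'of', 'citizenship']
print([t.pos_ for t in doc])    # e.g. ['NOUN', 'ADP', 'NOUN']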
def __init__(self, text):
    self.original = str(text)
    self.docs = nlp(text)
    self.person_entities = self.getPersonEntities()
    self.person_coreferences = self.getPersonCoreferences()
    # Only generate mutants when the text contains exactly one valid
    # person coreference cluster.
    if len(self.person_coreferences) == 1:
        coref = self.person_coreferences[0]
        if self.isValid(coref):
            template = self.generateTemplate(coref)
            self.templates, self.mutants, self.names, self.countries = \
                self.generateMutant(coref, template)
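# Hedged standalone sketch of the coreference clusters that
# getPersonCoreferences() appears to consume: neuralcoref (which targets
# spaCy 2.x) registers the doc._.coref_clusters extension, and each cluster
# exposes .main (the head mention) and .mentions (all coreferring spans).
# Model name and example text are illustrative assumptions.
import spacy
import neuralcoref

nlp_coref = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp_coref)

doc = nlp_coref('Alice said she would visit Paris with her sister.')
for cluster in doc._.coref_clusters:
    print(cluster.main.text, '->', [m.text for m in cluster.mentions])
    # e.g. Alice -> ['Alice', 'she', 'her']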
import itertools

import numpy as np


def NPs(caption):
    dataset = []
    doc = nlp(caption)
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp', 'pcomp', 'acomp'):
            # Keep the full clausal complement as one candidate phrase.
            subtree_span = doc[word.left_edge.i:word.right_edge.i + 1]
            dataset.append(' '.join([t.text for t in subtree_span]))
        elif word.dep_ == 'ROOT':
            # Recombine the ROOT with its left/right dependents, skipping auxiliaries.
            left_subtree = [
                doc[w.left_edge.i:w.i + 1] for w in word.lefts if w.dep_ != 'aux'
            ]
            right_subtree = [doc[w.i:w.right_edge.i + 1] for w in word.rights]
            for l in itertools.product(left_subtree, right_subtree):
                dataset.append(' '.join([l[0].text, word.text, l[1].text]))
                dataset.append(' '.join([word.text, l[1].text]))
                dataset.append(' '.join([l[0].text, word.text]))
                dataset.append(' '.join([t.text for t in l[0].subtree] + [word.text]))
        # Note: an earlier attempt to also extract local prepositional phrases
        # from ADP tokens (e.g. 'the dog with a frisbee in his mouth' ->
        # 'the dog with a frisbee') did not work and was dropped.
    noun_chunks = [n.text for n in doc.noun_chunks if not n.root.is_stop] + \
        [n.root.text for n in doc.noun_chunks if not n.root.is_stop]
    # Always include the original caption, then deduplicate.
    dataset = np.unique(dataset + [caption] + noun_chunks)
    return list(dataset)
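# Hedged usage sketch for NPs(): given a caption, it returns the caption
# itself, its noun chunks (where the root is not a stop word), and
# clause/ROOT recombinations, deduplicated. The caption and the expected
# fragments below are illustrative and depend on the parser's output.
caption = 'A man is riding a horse on the beach.'
for phrase in NPs(caption):
    print(phrase)
# Expected to include the full caption plus fragments such as 'A man',
# 'a horse', 'the beach', and recombinations like 'A man riding a horse'.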
def isContainGenderAssociatedWord(self):
    if not self.tokens:
        self.tokens = nlp(self.phrase)
    tokens = self.tokens
    for token in tokens:
        # Only check the head noun of the phrase.
        if token.pos_ == "NOUN" and token.dep_ == "ROOT":
            if isInMasculineGenderAssosiatedWord(token.text):
                self.gender_associated_word = token.text
                return True
            if isInFeminineGenderAssosiatedWord(token.text):
                self.gender_associated_word = token.text
                return True
    return False
def isHasSalutation(self):
    if not self.tokens:
        self.tokens = nlp(self.phrase)
    tokens = self.tokens
    for token in tokens:
        if isInMasculineSalutation(token.text):
            self.salutation = token.text
            self.gender = "male"
            return True
    for token in tokens:
        if isInFeminineSalutation(token.text):
            self.salutation = token.text
            self.gender = "female"
            return True
    return False
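# Hedged sketch of one plausible shape for the salutation lookups used above.
# These word lists are illustrative assumptions, not the project's lexicons.
MASCULINE_SALUTATIONS = {'mr', 'mr.', 'sir'}
FEMININE_SALUTATIONS = {'mrs', 'mrs.', 'ms', 'ms.', 'madam'}


def isInMasculineSalutation(word):
    return word.lower() in MASCULINE_SALUTATIONS


def isInFeminineSalutation(word):
    return word.lower() in FEMININE_SALUTATIONS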
def __init__(self, text):
    self.original = str(text)
    self.docs = nlp(text)
    self.person_entities = self.getPersonEntities()
    self.person_coreferences = self.getPersonCoreferences()
    if len(self.person_coreferences) == 1:
        coref = self.person_coreferences[0]
        if self.isValid(coref):
            template = self.generateTemplate(coref)
            self.templates, self.mutants, self.genders = \
                self.generateMutant(coref, template)
if "--model" in sys.argv: model_arg = sys.argv.index("--model") model_file = sys.argv[model_arg + 1] if "--features" in sys.argv: features_arg = sys.argv.index("--features") features_file = sys.argv[features_arg + 1] clf = load(model_file) features_obj = load(features_file) feature_set, feature_hasher = features_obj if train.check_file(input_file): print "file is in wrong format. expected raw and not proccessed file" data = {} for sen_id, sen in utils.read_lines(sys.argv[1]): data[sen_id] = utils.nlp(sen) lexicon_helper = Lexicon_helper() extracted_ent_paris_svm = [] feature_extractor = FeatureExtractor(lexicon_helper, feature_hasher, feature_set) sen_entities_with_x = spacy_parser.get_x_data(feature_extractor, data) sen_entities_with_x = sorted(sen_entities_with_x, key=utils.get_senid_int) allx = np.array([x[3].toarray()[0] for x in sen_entities_with_x]) predicted_entities_pairs = clf.predict(allx) extracted_ent_paris_svm = filter_ent_pairs(predicted_entities_pairs, sen_entities_with_x) #Rules extraction extracted_ents_rules = rules_extractor.predict(data, lexicon_helper) extracted_ents_rules = sorted(extracted_ents_rules,
def getTokens(self):
    # Parse the phrase once and cache the resulting Doc for later calls.
    if not self.tokens:
        self.tokens = nlp(self.phrase)
    return self.tokens