def __init__(self, featureSet):
    """Load bacsu and Subtiwiki synonym dictionaries and precompute their
    per-key union (self.any) and intersection (self.all) as sorted lists.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    # NOTE(review): hard-coded home-directory data paths — verify they exist,
    # or move them into Settings as the other variants of this builder do.
    self.bacsu = readBacsu(os.path.expanduser("~/data/BioNLP11SharedTask/supporting-tasks/bacsu-modified.txt"))
    self.subti = readSubtiwiki(os.path.expanduser("~/data/BioNLP11SharedTask/supporting-tasks/Subtiwiki-Synonyms.csv"))
    #self.subti = readSubtiwiki(os.path.expanduser("~/cvs_checkout/JariSandbox/Wiki/subtiwiki/Subtiwiki-Synonyms.csv"))
    # self.any = OR of the dictionaries, self.all = AND of the dictionaries.
    # Set union/intersection replaces the original manual membership loops
    # (which also used the deprecated dict.has_key); results are identical
    # sorted lists per key.
    self.any = {}
    self.all = {}
    for key in sorted(set(self.bacsu.keys() + self.subti.keys())):
        bacsuSynonyms = set(self.bacsu.get(key, []))
        subtiSynonyms = set(self.subti.get(key, []))
        self.any[key] = sorted(bacsuSynonyms | subtiSynonyms)
        self.all[key] = sorted(bacsuSynonyms & subtiSynonyms)
def __init__(self, featureSet=None):
    """Load DrugBank data into class-level caches on first construction.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    # NOTE(review): hard-coded absolute path — confirm it exists on this
    # machine, or read it from Settings as the newer variants do.
    drugBankFile = "/home/jari/data/DDIExtraction2011/resources/drugbank.xml"
    # Load drug data into memory on first call to constructor.
    # 'is None' replaces the original '== None' (identity test for None).
    if DrugFeatureBuilder.data is None:
        DrugFeatureBuilder.data, DrugFeatureBuilder.nameToId = prepareDrugBank(drugBankFile)
        DrugFeatureBuilder.interactionPairs = buildInteractionPairs(DrugFeatureBuilder.data)
def __init__(self, featureSet=None):
    """Install (if needed) and load DrugBank data into class-level caches,
    including a token -> drug-id index for multi-word drug names.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    if not hasattr(Settings, "DRUG_BANK_XML"):
        print >> sys.stderr, "Drug Bank XML not installed, installing now"
        installDrugBank(updateLocalSettings=True)
    drugBankFile = Settings.DRUG_BANK_XML
    # Load drug data into memory on first call to constructor.
    # 'is None' replaces the original '== None' (identity test for None).
    if DrugFeatureBuilder.data is None:
        DrugFeatureBuilder.data, DrugFeatureBuilder.nameToId = prepareDrugBank(drugBankFile)
        # Index each token of multi-word drug names to the ids of those names
        # so partial name matches can be resolved.
        DrugFeatureBuilder.tokenToId = {}
        for name in DrugFeatureBuilder.nameToId:
            splits = name.split()
            if len(splits) < 2:  # single-word names are not indexed by token
                continue
            for split in splits:
                DrugFeatureBuilder.tokenToId.setdefault(split, []).extend(DrugFeatureBuilder.nameToId[name])
        # Deduplicate and sort the id lists for deterministic lookups.
        for token in DrugFeatureBuilder.tokenToId:
            DrugFeatureBuilder.tokenToId[token] = sorted(set(DrugFeatureBuilder.tokenToId[token]))
        DrugFeatureBuilder.interactionPairs = buildInteractionPairs(DrugFeatureBuilder.data)
def __init__(self, featureSet, style=None):
    """Load a word-vector model, taking the path from style["wordvector"]
    when given, otherwise from Settings.W2VFILE.

    @param featureSet: feature id set passed through to FeatureBuilder
    @param style: optional dict of style options
    """
    FeatureBuilder.__init__(self, featureSet, style)
    # BUG FIX: the original tested '"wordvector" in style' without checking
    # style itself, raising TypeError when style is None (the default).
    if style is not None and "wordvector" in style and isinstance(style["wordvector"], basestring):
        wordVectorPath = style["wordvector"]
    else:
        wordVectorPath = Settings.W2VFILE
    print >> sys.stderr, "Loading word vectors from", wordVectorPath
    # Limits passed through unchanged from the original call (see WV.load).
    self.model = WV.load(wordVectorPath, 100000, 10000000) #10000, 500000)
def __init__(self, featureSet):
    """Set up empty term indices and load the OntoBiotope ontology file that
    ships next to this module.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    self.terms = {}
    self.byName = {}
    self.byKeyword = {}
    moduleDir = os.path.dirname(os.path.abspath(__file__))
    self.loadOBO(os.path.join(moduleDir, "OntoBiotope_BioNLP-ST-2016.obo"))
def __init__(self, featureSet):
    """Initialize the feature builder with the shared feature id set.

    @type featureSet: Core.IdSet
    @param featureSet: The feature ids
    """
    FeatureBuilder.__init__(self, featureSet)
def __init__(self, featureSet):
    """Create the builder, delegating all state to the FeatureBuilder base.

    @type featureSet: Core.IdSet
    @param featureSet: The feature ids
    """
    FeatureBuilder.__init__(self, featureSet)
def __init__(self, featureSet, style=None):
    """Initialize the builder with default feature-generation flags.

    @type featureSet: IdSet
    @param featureSet: feature ids
    @param style: optional style options forwarded to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet, style=style)
    self.noAnnType = False
    self.predictedRange = None
    self.ontologyFeatureBuilder = None
def __init__(self, featureSet=None):
    """Install (if needed) and load DrugBank data into class-level caches.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    if not hasattr(Settings, "DRUG_BANK_XML"):
        print >> sys.stderr, "Drug Bank XML not installed, installing now"
        installDrugBank(updateLocalSettings=True)
    drugBankFile = Settings.DRUG_BANK_XML
    # Load drug data into memory on first call to constructor.
    # 'is None' replaces the original '== None' (identity test for None).
    if DrugFeatureBuilder.data is None:
        DrugFeatureBuilder.data, DrugFeatureBuilder.nameToId = prepareDrugBank(drugBankFile)
        DrugFeatureBuilder.interactionPairs = buildInteractionPairs(DrugFeatureBuilder.data)
def predict(nlp, cls, file_path_txt, out_file_path):
    """Run the classifier over a text file and write positive Live_In
    relation predictions to out_file_path, one tab-separated line each.

    @param nlp: NLP pipeline handed to FeatureBuilder
    @param cls: trained classifier with a predict() method
    @param file_path_txt: input text file to extract features from
    @param out_file_path: annotation output file path
    """
    fb = FeatureBuilder(nlp)
    features_matrix_str = fb.get_features_of_file(file_path_txt)
    pred_labels = cls.predict(features_matrix_str)
    # 'with' guarantees the output file is closed even if a write raises
    # (the original left the handle open on error).
    with open(out_file_path, 'w') as out_file:
        for (idxs, features_list), label in zip(features_matrix_str, pred_labels):
            if label == 1:  # only positive predictions are written out
                sent_num, obj1, obj2 = idxs
                sent_num = 'sent' + str(sent_num)
                obj1, obj2 = str(obj1), str(obj2)
                out_file.write(sent_num + '\t' + obj1 + '\t' + 'Live_In' + '\t' + obj2 + '\t\n')
def train_classifier(nlp, train_txt_file, train_annotation_file): fb = FeatureBuilder(nlp) features_matrix = fb.get_features_of_file(train_txt_file) annotation_dict, r2i = annotation_to_dict(train_annotation_file) lc = LabelChecker(annotation_dict, r2i) cls = MyClassifier(fb.features_to_index) gold_labels = lc.get_labels_of(features_matrix) cls.train_on(features_matrix, gold_labels) pred_labels = cls.predict(features_matrix) acc_all = accuracy_score(gold_labels, pred_labels) acc_filtered = accuracy_of(gold_labels, pred_labels) print 'train - accuracy all %0.2f%%' % (acc_all * 100.0) print 'train - accuracy filtered %0.2f%%' % (acc_filtered * 100.0) return cls
def __init__(self, featureSet):
    """Install (if needed) and load bacsu/Subtiwiki synonym dictionaries,
    then precompute per-key union (self.any) and intersection (self.all)
    as sorted lists.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    if not hasattr(Settings, "TEES_RESOURCES"):
        print >> sys.stderr, "TEES example builder data files not installed, installing now"
        installRENData(updateLocalSettings=True)
    self.bacsu = readBacsu(os.path.join(Settings.TEES_RESOURCES, "bacsu-modified.txt"))
    self.subti = readSubtiwiki(os.path.join(Settings.TEES_RESOURCES, "Subtiwiki-Synonyms.csv"))
    # self.any = OR of the dictionaries, self.all = AND of the dictionaries.
    # Set union/intersection replaces the original manual membership loops
    # (which also used the deprecated dict.has_key); results are identical
    # sorted lists per key.
    self.any = {}
    self.all = {}
    for key in sorted(set(self.bacsu.keys() + self.subti.keys())):
        bacsuSynonyms = set(self.bacsu.get(key, []))
        subtiSynonyms = set(self.subti.get(key, []))
        self.any[key] = sorted(bacsuSynonyms | subtiSynonyms)
        self.all[key] = sorted(bacsuSynonyms & subtiSynonyms)
def __init__(self, featureSet):
    """Load bacsu and Subtiwiki synonym dictionaries and precompute their
    per-key union (self.any) and intersection (self.all) as sorted lists.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    # NOTE(review): hard-coded home-directory data paths — verify they exist,
    # or move them into Settings as the other variants of this builder do.
    self.bacsu = readBacsu(os.path.expanduser("~/data/BioNLP11SharedTask/supporting-tasks/bacsu-modified.txt"))
    self.subti = readSubtiwiki(os.path.expanduser("~/data/BioNLP11SharedTask/supporting-tasks/Subtiwiki-Synonyms.csv"))
    #self.subti = readSubtiwiki(os.path.expanduser("~/cvs_checkout/JariSandbox/Wiki/subtiwiki/Subtiwiki-Synonyms.csv"))
    # self.any = OR of the dictionaries, self.all = AND of the dictionaries.
    # Set union/intersection replaces the original manual membership loops
    # (which also used the deprecated dict.has_key); results are identical
    # sorted lists per key.
    self.any = {}
    self.all = {}
    for key in sorted(set(self.bacsu.keys() + self.subti.keys())):
        bacsuSynonyms = set(self.bacsu.get(key, []))
        subtiSynonyms = set(self.subti.get(key, []))
        self.any[key] = sorted(bacsuSynonyms | subtiSynonyms)
        self.all[key] = sorted(bacsuSynonyms & subtiSynonyms)
def __init__(self, featureSet):
    """Install (if needed) and load bacsu/Subtiwiki synonym dictionaries,
    then precompute per-key union (self.any) and intersection (self.all)
    as sorted lists.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    if not hasattr(Settings, "TEES_RESOURCES"):
        print >> sys.stderr, "TEES example builder data files not installed, installing now"
        installRENData(updateLocalSettings=True)
    self.bacsu = readBacsu(os.path.join(Settings.TEES_RESOURCES, "bacsu-modified.txt"))
    self.subti = readSubtiwiki(os.path.join(Settings.TEES_RESOURCES, "Subtiwiki-Synonyms.csv"))
    # self.any = OR of the dictionaries, self.all = AND of the dictionaries.
    # Set union/intersection replaces the original manual membership loops
    # (which also used the deprecated dict.has_key); results are identical
    # sorted lists per key.
    self.any = {}
    self.all = {}
    for key in sorted(set(self.bacsu.keys() + self.subti.keys())):
        bacsuSynonyms = set(self.bacsu.get(key, []))
        subtiSynonyms = set(self.subti.get(key, []))
        self.any[key] = sorted(bacsuSynonyms | subtiSynonyms)
        self.all[key] = sorted(bacsuSynonyms & subtiSynonyms)
def __init__(self, featureSet=None):
    """Install (if needed) and load DrugBank data into class-level caches,
    including a token -> drug-id index for multi-word drug names.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    if not hasattr(Settings, "DRUG_BANK_XML"):
        print >> sys.stderr, "Drug Bank XML not installed, installing now"
        installDrugBank(updateLocalSettings=True)
    drugBankFile = Settings.DRUG_BANK_XML
    # Load drug data into memory on first call to constructor.
    # 'is None' replaces the original '== None' (identity test for None).
    if DrugFeatureBuilder.data is None:
        DrugFeatureBuilder.data, DrugFeatureBuilder.nameToId = prepareDrugBank(drugBankFile)
        # Index each token of multi-word drug names to the ids of those names
        # so partial name matches can be resolved.
        DrugFeatureBuilder.tokenToId = {}
        for name in DrugFeatureBuilder.nameToId:
            splits = name.split()
            if len(splits) < 2:  # single-word names are not indexed by token
                continue
            for split in splits:
                DrugFeatureBuilder.tokenToId.setdefault(split, []).extend(DrugFeatureBuilder.nameToId[name])
        # Deduplicate and sort the id lists for deterministic lookups.
        for token in DrugFeatureBuilder.tokenToId:
            DrugFeatureBuilder.tokenToId[token] = sorted(set(DrugFeatureBuilder.tokenToId[token]))
        DrugFeatureBuilder.interactionPairs = buildInteractionPairs(DrugFeatureBuilder.data)
def __init__(self, featureSet):
    """Load the BioInfer ontologies used for ontology-based features.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    # g_bioInferFileName is only read here, so no 'global' declaration is
    # needed (the original declared it anyway).
    self.ontologies = loadOntologies(g_bioInferFileName)
def __init__(self, featureSet):
    """Initialize the builder with default flags for annotation-type and
    entity handling.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    self.useNonNameEntities = False
    self.noAnnType = False
    self.edgeTypesForFeatures = []
def __init__(self, featureSet=None):
    """Initialize the builder and bind NLTK's WordNet corpus for lookups.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    # Import here so NLTK is only required when this builder is used.
    from nltk.corpus import wordnet as wn
    self.wordnet = wn
    print >> sys.stderr, "Using WordNet via NLTK"
def __init__(self, featureSet):
    """Initialize the builder with a deterministically seeded RNG.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    # Fixed seed 0 keeps the generated values reproducible across runs.
    self.generator = random.Random(0)
from FeatureBuilder import FeatureBuilder # fb = FeatureBuilder(csv_file_name='~/sdb1/ais/ais_data.csv') fb = FeatureBuilder( csv_file_name= '~/sdb1/ais/data/frequencyOfEdgesInData_ais201710_compact_version.csv') fb.run() print fb.get_new_feature_df()
def __init__(self, featureSet):
    """Initialize empty term indices and parse the bundled OntoBiotope OBO
    ontology located in this module's directory.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    self.byKeyword = {}
    self.byName = {}
    self.terms = {}
    here = os.path.dirname(os.path.abspath(__file__))
    self.loadOBO(os.path.join(here, "OntoBiotope_BioNLP-ST-2016.obo"))
def __init__(self, featureSet):
    """Construct the builder and load the BioInfer ontology definitions.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    # Read-only access to the module-level file name; the 'global'
    # declaration in the original was unnecessary for a read.
    self.ontologies = loadOntologies(g_bioInferFileName)
def __init__(self, featureSet):
    """Initialize the feature builder with the shared feature id set.

    @param featureSet: feature ids, forwarded unchanged to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
from Classifier import Classifier
from FeatureBuilder import FeatureBuilder

features = FeatureBuilder()
features.load_model()

# Generate the three one-vs-rest datasets first (0=company, 1=location,
# 2=goods), in the same order as before, then train one model per category.
datasets = [features.one_vs_rest_generator(category) for category in range(3)]

classifier = Classifier(features.company_feature_encoder,
                        features.location_feature_encoder,
                        features.goods_feature_encoder)

model_paths = ['./models/one_vs_rest_company',
               './models/one_vs_rest_location',
               './models/one_vs_rest_goods']
for (X_train, y_train, X_test, y_test), model_path in zip(datasets, model_paths):
    classifier.tpot_classifiers(X_train, y_train, X_test, y_test, model_path)
def build_feature_builders(self):
    """Create one FeatureBuilder per configured feature option and store
    them on self.feature_list."""
    self.feature_list = [
        FeatureBuilder(feature_opt, self.config, self.dataloader)
        for feature_opt in self.config["feature"]
    ]
def __init__(self, featureSet=None):
    """Set up the builder and expose NLTK's WordNet corpus on self.wordnet.

    @param featureSet: feature id set passed through to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet)
    # Deferred import: NLTK is needed only when WordNet features are built.
    from nltk.corpus import wordnet
    self.wordnet = wordnet
    print >> sys.stderr, "Using WordNet via NLTK"
from Classifier import Classifier
from FeatureBuilder import FeatureBuilder
import json
from RandomStringClassifier import RandomStringClassifier
from DateClassifier import DateClassifier
import operator

# Load the trained feature encoders and the three-category classifier,
# plus auxiliary date / random-string classifiers.
features = FeatureBuilder()
features.load_model()
classifier_3types = Classifier(features.company_feature_encoder,
                               features.location_feature_encoder,
                               features.goods_feature_encoder)
classifier_3types.load_classifiers()
dateClassifier = DateClassifier()
randomClassifier = RandomStringClassifier()
fname = 'test5.json'
with open('./data/' + fname) as f:
    data = json.load(f)
output = {}
confidence_thresh = 0.5
# Walk the OCR recognition result: lines -> words, classifying each word.
# NOTE(review): this snippet appears truncated here — output,
# confidence_thresh, clean_text and randomClassifier are unused in the
# visible portion; confirm against the full script.
for cats in data['recognitionResult']['lines']:
    for word in cats['words']:
        text = word['text'].lower()
        clean_text = ''.join(c for c in text if c.isalnum())
        is_date = dateClassifier.classify(text)
def __init__(self, featureSet, style=None):
    """Load the default word-vector model configured in Settings.W2VFILE.

    @param featureSet: feature id set passed through to FeatureBuilder
    @param style: optional style options forwarded to FeatureBuilder
    """
    FeatureBuilder.__init__(self, featureSet, style)
    vectorPath = Settings.W2VFILE
    # Limit arguments kept exactly as in the original call (see WV.load).
    self.model = WV.load(vectorPath, 100000, 10000000) #10000, 500000)
def initialize(self, dataPath):
    """Run the base-class initialization, then load the Yelp dataset from
    dataPath onto self.dataset.

    @param dataPath: path handed to both the base class and the loader
    """
    FeatureBuilder.initialize(self, dataPath)
    self.dataset = getYelpDataset(dataPath)