def __init__(self, preprocessor, architecture_path=None, weights_path=None):
    '''
    Initialize with a preprocessor; optionally restore a trained model.

    When ``architecture_path`` (a keras-formatted JSON architecture
    file) is given, ``weights_path`` is required as well, and the model
    is re-instantiated from those two files.
    '''
    self.preprocessor = preprocessor
    # Load the pipeline once and keep it to avoid repeated instantiation.
    self.nlp = spacy.load('en')

    # Map every POS tag name emitted by the tagger to a stable index.
    self.PoS_tags_to_indices = {
        tag: idx for idx, tag in enumerate(self.nlp.tagger.tag_names)
    }

    self.number_tagger = index_numbers.NumberTagger()
    self.n_tags = len(self.nlp.tagger.tag_names)

    # Restore a pre-trained model when an architecture file was supplied.
    if architecture_path is not None:
        assert (weights_path is not None)

        print("loading model architecture from file: %s" % architecture_path)
        with open(architecture_path) as model_arch:
            model_arch_str = model_arch.read()
        self.model = model_from_json(model_arch_str)
        self.model.load_weights(weights_path)
def build_training_data(csv_path="ctgov_sample_sizes.csv"):
    '''
    Build a labeled DataFrame for sample-size extraction training.

    Reads the CSV at ``csv_path``, drops citations without an abstract,
    then: (1) normalizes written numbers in each abstract, (2) tokenizes
    it, and (3) derives a per-token label vector (column ``y``) by
    matching enrollment counts against the tokens.
    '''
    # read in data, drop citations w/no abstract
    df = pd.read_csv(csv_path).dropna(subset=["ab"])

    nlp = spacy.load('en')  # for segmentation / tokenization
    number_tagger = index_numbers.NumberTagger()  # normalize numbers

    df['ab_numbers'] = df['ab'].apply(number_tagger.swap)
    df['tokenized_ab'] = df['ab_numbers'].apply(lambda text: list(nlp(text)))

    label_vectors = []
    for _, row in df.iterrows():
        # Enrollment counts (total, phase 1, phase 2) mapped to tag names.
        nums_to_labels = {
            row["enrolled_totals"]: "N",
            row["enrolled_P1"]: "n1",
            row["enrolled_P2"]: "n2",
        }
        label_vectors.append(annotate(row["tokenized_ab"], nums_to_labels))

    df["y"] = label_vectors
    return df
def __init__(self, preprocessor, architecture_path=None, weights_path=None):
    '''
    Initialize with a preprocessor; optionally restore a trained model.

    When ``architecture_path`` (a keras-formatted JSON architecture
    specification) is provided, ``weights_path`` must be too, and the
    trained model is re-instantiated from those files.
    '''
    self.preprocessor = preprocessor
    self.nlp = tokenizer.nlp

    # Fixed POS tag inventory; each tag maps to a stable integer index.
    self.tag_names = [u'""', u'#', u'$', u"''", u',', u'-LRB-', u'-RRB-', u'.', u':', u'ADD', u'AFX', u'BES', u'CC', u'CD', u'DT', u'EX', u'FW', u'GW', u'HVS', u'HYPH', u'IN', u'JJ', u'JJR', u'JJS', u'LS', u'MD', u'NFP', u'NIL', u'NN', u'NNP', u'NNPS', u'NNS', u'PDT', u'POS', u'PRP', u'PRP$', u'RB', u'RBR', u'RBS', u'RP', u'SP', u'SYM', u'TO', u'UH', u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ', u'WDT', u'WP', u'WP$', u'WRB', u'XX', u'``']
    self.PoS_tags_to_indices = {
        tag: idx for idx, tag in enumerate(self.tag_names)
    }
    self.n_tags = len(self.tag_names)

    # threshold governing whether to abstain from predicting
    # this as a sample size altogether (for highest scoring
    # integer). As always, this was definitely set in a totally
    # scientifically sound way ;).
    self.magic_threshold = 0.0205

    self.number_tagger = index_numbers.NumberTagger()

    # Restore a pre-trained model when an architecture file was supplied.
    if architecture_path is not None:
        assert weights_path is not None

        with open(architecture_path) as model_arch:
            model_arch_str = model_arch.read()
        self.model = model_from_json(model_arch_str)
        self.model.load_weights(weights_path)
def __init__(self, preprocessor, architecture_path=None, weights_path=None):
    '''
    Initialize with a preprocessor; optionally restore a trained model.

    Optionally allow a path to a (keras formatted) JSON model
    architecture specification and associated set of weights -- this
    allows easy loading and re-instantiation of trained models.
    '''
    self.preprocessor = preprocessor
    self.nlp = spacy.load('en')  # to avoid instantiating multiple times.

    # this is for POS tags: map each tag name to a stable integer index
    self.PoS_tags_to_indices = {}
    for idx, tag in enumerate(self.nlp.tagger.tag_names):
        self.PoS_tags_to_indices[tag] = idx

    self.n_tags = len(self.nlp.tagger.tag_names)

    # threshold governing whether to abstain from predicting
    # this as a sample size altogether (for highest scoring
    # integer). As always, this was definitely set in a totally
    # scientifically sound way ;).
    self.magic_threshold = 0.0205

    self.number_tagger = index_numbers.NumberTagger()

    # check if we're loading in a pre-trained model
    if architecture_path is not None:
        assert (weights_path is not None)

        print("loading model architecture from file: %s" % architecture_path)
        with open(architecture_path) as model_arch:
            model_arch_str = model_arch.read()
        # BUG FIX: original passed undefined name `doc_arch_str` here,
        # which raised NameError whenever a saved model was loaded; the
        # JSON string read above is `model_arch_str`.
        self.model = model_from_json(model_arch_str)
        self.model.load_weights(weights_path)