def __init__(self,
             preprocessor,
             architecture_path=None,
             weights_path=None):
    '''
    Optionally allow a path to a (keras formatted) JSON model architecture
    specification and associated set of weights -- this allows easy loading
    and re-instantiation of trained models.
    '''
    self.preprocessor = preprocessor

    # Load spaCy once here so we avoid instantiating it multiple times.
    self.nlp = spacy.load('en')

    # Map each POS tag name to a stable integer index.
    tag_names = self.nlp.tagger.tag_names
    self.PoS_tags_to_indices = {tag: idx for idx, tag in enumerate(tag_names)}

    self.number_tagger = index_numbers.NumberTagger()

    self.n_tags = len(tag_names)

    # Re-instantiate a previously trained model, if paths were supplied.
    if architecture_path is not None:
        assert (weights_path is not None)

        print("loading model architecture from file: %s" %
              architecture_path)
        with open(architecture_path) as model_arch:
            self.model = model_from_json(model_arch.read())

        self.model.load_weights(weights_path)
def build_training_data(csv_path="ctgov_sample_sizes.csv"):
    """Assemble a labeled DataFrame for sample-size extraction.

    Reads the citations CSV, drops rows lacking an abstract, normalizes
    written-out numbers, tokenizes each abstract, and attaches a per-token
    label vector (via ``annotate``) derived from the enrollment columns.
    """
    # read in data, drop citations w/no abstract
    df = pd.read_csv(csv_path).dropna(subset=["ab"])

    # for segmentation
    nlp = spacy.load('en')

    # normalize numbers
    number_tagger = index_numbers.NumberTagger()

    # 1. swap written-out numbers for digits
    df['ab_numbers'] = df['ab'].apply(number_tagger.swap)
    # 2. tokenize the normalized abstracts
    df['tokenized_ab'] = df['ab_numbers'].apply(lambda text: list(nlp(text)))

    # 3. build one label vector per abstract
    label_vectors = []
    for _, row in df.iterrows():
        nums_to_labels = {
            row["enrolled_totals"]: "N",
            row["enrolled_P1"]: "n1",
            row["enrolled_P2"]: "n2"
        }
        label_vectors.append(annotate(row["tokenized_ab"], nums_to_labels))
    df["y"] = label_vectors
    return df
    def __init__(self, preprocessor, architecture_path=None, weights_path=None):
        '''
        Optionally allow a path to a (keras formatted) JSON model architecture
        specification and associated set of weights -- this allows easy loading
        and re-instantiation of trained models.
        '''
        self.preprocessor = preprocessor

        # Share the tokenizer module's spaCy pipeline rather than loading
        # a fresh one here.
        self.nlp = tokenizer.nlp

        # Fixed inventory of fine-grained POS tags. Index order must stay
        # stable, since trained models depend on it.
        self.tag_names = [u'""', u'#', u'$', u"''", u',', u'-LRB-', u'-RRB-', u'.', u':', u'ADD', u'AFX', u'BES', u'CC', u'CD', u'DT', u'EX', u'FW', u'GW', u'HVS', u'HYPH', u'IN', u'JJ', u'JJR', u'JJS', u'LS', u'MD', u'NFP', u'NIL', u'NN', u'NNP', u'NNPS', u'NNS', u'PDT', u'POS', u'PRP', u'PRP$', u'RB', u'RBR', u'RBS', u'RP', u'SP', u'SYM', u'TO', u'UH', u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ', u'WDT', u'WP', u'WP$', u'WRB', u'XX', u'``']
        self.PoS_tags_to_indices = {
            tag: idx for idx, tag in enumerate(self.tag_names)
        }
        self.n_tags = len(self.tag_names)

        # threshold governing whether to abstain from predicting
        # this as a sample size altogether (for highest scoring
        # integer). As always, this was definitely set in a totally
        # scientifically sound way ;).
        self.magic_threshold = 0.0205

        self.number_tagger = index_numbers.NumberTagger()

        # check if we're loading in a pre-trained model
        if architecture_path is not None:
            assert(weights_path is not None)

            with open(architecture_path) as model_arch:
                self.model = model_from_json(model_arch.read())

            self.model.load_weights(weights_path)
# Example #4
# 0
    def __init__(self,
                 preprocessor,
                 architecture_path=None,
                 weights_path=None):
        '''
        Optionally allow a path to a (keras formatted) JSON model architecture
        specification and associated set of weights -- this allows easy loading
        and re-instantiation of trained models.

        Parameters
        ----------
        preprocessor : object
            Preprocessor used to map texts to model inputs.
        architecture_path : str, optional
            Path to a keras JSON architecture file. If given, `weights_path`
            must also be given.
        weights_path : str, optional
            Path to the matching keras weights file.

        Raises
        ------
        ValueError
            If `architecture_path` is given without `weights_path`.
        '''
        self.preprocessor = preprocessor
        self.nlp = spacy.load('en')  # to avoid instantiating multiple times.

        # this is for POS tags: map tag name -> integer index
        self.PoS_tags_to_indices = {}
        for idx, tag in enumerate(self.nlp.tagger.tag_names):
            self.PoS_tags_to_indices[tag] = idx

        self.n_tags = len(self.nlp.tagger.tag_names)

        # threshold governing whether to abstain from predicting
        # this as a sample size altogether (for highest scoring
        # integer). As always, this was definitely set in a totally
        # scientifically sound way ;).
        self.magic_threshold = 0.0205

        self.number_tagger = index_numbers.NumberTagger()

        # check if we're loading in a pre-trained model
        if architecture_path is not None:
            # was `assert`, which is stripped under `python -O`; raise an
            # explicit error for a genuinely invalid argument combination.
            if weights_path is None:
                raise ValueError("weights_path is required when "
                                 "architecture_path is given")

            print("loading model architecture from file: %s" %
                  architecture_path)
            with open(architecture_path) as model_arch:
                model_arch_str = model_arch.read()
                # BUG FIX: was model_from_json(doc_arch_str), an undefined
                # name (NameError at runtime); use the string just read.
                self.model = model_from_json(model_arch_str)

            self.model.load_weights(weights_path)