def _get_mention_splits(doc, mention, seek, span):
    """Split the sentence containing `mention` into two overlapping halves.

    Locates `mention` in `doc` at or after index `seek`, then cuts the
    sentence `doc[span[0]:span[1]]` around it.  Returns a pair:
    a two-element list of token lists — the left context ending with the
    mention, and the mention followed by the right context — and the
    document index just past the mention (the next seek position).
    """
    offset_from_seek = _.index_of(doc[seek:], mention)
    start_in_sentence = seek - span[0] + offset_from_seek
    end_in_sentence = start_in_sentence + len(mention)
    sentence = doc[span[0]:span[1]]
    left_with_mention = sentence[:start_in_sentence] + mention
    mention_with_right = mention + sentence[end_in_sentence:]
    tokenized_halves = [parse_for_tokens(left_with_mention),
                        parse_for_tokens(mention_with_right)]
    return (tokenized_halves, span[0] + end_in_sentence)
def check_overlap(ranks_1, ranks_2):
    """Count pairwise rank-order agreements between two sets of rankings.

    For each aligned pair of rankings, every unordered pair of documents
    from the first ranking is checked against the head of the second
    ranking (truncated to the first ranking's length):
      - both documents absent from the head -> skipped (not counted as agree),
      - only the second document absent     -> counted as agreement,
      - both present -> agreement iff their relative order matches.

    Args:
        ranks_1: iterable of rankings (lists of hashable doc ids).
        ranks_2: iterable of rankings, aligned with `ranks_1`.

    Returns:
        (agree_ctr, num_combos): agreements and total pairs examined.
    """
    def index_of(seq, value):
        # Stdlib replacement for pydash `_.index_of`: index or -1 if absent.
        try:
            return seq.index(value)
        except ValueError:
            return -1

    agree_ctr = 0
    num_combos = 0
    # Renamed loop variables: the original shadowed the parameters
    # `ranks_1`/`ranks_2`, which worked but obscured the logic.
    for r1, r2 in zip(ranks_1, ranks_2):
        # Loop-invariant slice hoisted out of the inner loop (the original
        # rebuilt it twice per document pair).
        r2_head = r2[:len(r1)]
        for doc_1, doc_2 in combinations(r1, 2):
            num_combos += 1
            d_1_in_2 = index_of(r2_head, doc_1)
            d_2_in_2 = index_of(r2_head, doc_2)
            if d_1_in_2 == -1:
                continue
            if d_2_in_2 == -1:
                agree_ctr += 1
                continue
            # Only needed when both docs appear in the head, so computed
            # after the early-continue guards.
            d_1_in_1 = index_of(r1, doc_1)
            d_2_in_1 = index_of(r1, doc_2)
            if (d_1_in_1 < d_2_in_1) == (d_1_in_2 < d_2_in_2):
                agree_ctr += 1
    return agree_ctr, num_combos
def adult():
    """Load and preprocess the UCI Adult (census income) dataset.

    Reads ``./data/adult.data`` (no header), encodes each categorical
    column as an integer index into its option list, maps unknown values
    to ``max(existing codes) + 1``, and binarizes the target
    (True for the '>50K' class).

    Returns:
        (df, feature_names, cat_mapping, target_names)
    """
    feature_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'
    ]
    cat_mapping = {
        'workclass': [
            'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
            'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'
        ],
        'education': [
            'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
            'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
            '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'
        ],
        'marital-status': [
            'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated',
            'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'
        ],
        'occupation': [
            'Tech-support', 'Craft-repair', 'Other-service', 'Sales',
            'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
            'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing',
            'Transport-moving', 'Priv-house-serv', 'Protective-serv',
            'Armed-Forces'
        ],
        'relationship': [
            'Wife', 'Own-child', 'Husband', 'Not-in-family',
            'Other-relative', 'Unmarried'
        ],
        'race': [
            'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other',
            'Black'
        ],
        'sex': ['Female', 'Male'],
        'native-country': [
            'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada',
            'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan',
            'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras',
            'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico',
            'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos',
            'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala',
            'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador',
            'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'
        ]
    }
    df = pd.read_csv('./data/adult.data', header=None,
                     names=feature_names + ['target'])
    for col_name, options in cat_mapping.items():
        # O(1) dict lookup instead of the original per-row linear scan
        # (pydash index_of over the option list); unmatched values -> -1.
        # Values in the raw file are whitespace-padded, hence the strip().
        code_for = {option: i for i, option in enumerate(options)}
        df[col_name] = df[col_name].apply(
            lambda x, code_for=code_for: code_for.get(x.strip(), -1))
        # Hoisted out of the lambda: the original recomputed
        # max(df[col_name]) + 1 for every single row.
        unknown_code = max(df[col_name]) + 1
        df[col_name] = df[col_name].apply(
            lambda x, unknown_code=unknown_code:
                x if (x != -1) else unknown_code)
    target_names = ['<=50K', '>50K']
    # Binary target: True iff the label is the '>50K' class.
    df.target = df.target.apply(lambda x: '>' in x)
    return df, feature_names, cat_mapping, target_names
def _get_splits(documents, mentions):
    """Produce mention splits for every mention found across `documents`.

    Walks the documents in order, consuming `mentions` sequentially: each
    mention is searched for at or after the current seek position of the
    current document.  If the literal mention is not found, a
    punctuation-spaced variant is tried; if neither matches, the scan
    moves on to the next document.  Returns the list of split pairs from
    `_get_mention_splits`, one entry per located mention.
    """
    splits_by_mention = []
    sentence_spans_per_doc = [parse_for_sentence_spans(text)
                              for text in documents]
    next_mention = 0
    for text, sentence_spans in zip(documents, sentence_spans_per_doc):
        cursor = 0
        while next_mention < len(mentions):
            mention = mentions[next_mention]
            offset = _.index_of(text[cursor:], mention)
            if offset == -1:
                # Retry with spaces forced around '.' and ',' (collapsing
                # runs of spaces), while keeping 'D.C.' intact.
                variant = re.sub(
                    ' +', ' ',
                    ' , '.join(' . '.join(mention.split('.')).split(','))
                ).replace('D . C .', 'D.C.')
                offset = _.index_of(text[cursor:], variant)
            if offset == -1:
                break
            start_idx = offset + cursor
            end_idx = start_idx + len(mention)
            span = _create_span(sentence_spans, start_idx, end_idx)
            mention_splits, cursor = _get_mention_splits(
                text, mention, cursor, span)
            splits_by_mention.append(mention_splits)
            next_mention += 1
    return splits_by_mention
def test_index_of(case, value, from_index, expected):
    """Parametrized check that `_.index_of` returns `expected` when
    searching `case` for `value` starting at `from_index`."""
    actual = _.index_of(case, value, from_index)
    assert actual == expected
def _get_mention_sentence(doc, mention, seek, span):
    """Tokenize the sentence containing `mention`.

    Locates `mention` in `doc` at or after index `seek` within the
    sentence `doc[span[0]:span[1]]`.  Returns the tokenized sentence and
    the document index just past the mention (the next seek position).
    """
    offset_from_seek = _.index_of(doc[seek:], mention)
    start_in_sentence = seek - span[0] + offset_from_seek
    end_in_sentence = start_in_sentence + len(mention)
    sentence = doc[span[0]:span[1]]
    return (parse_for_tokens(sentence), span[0] + end_in_sentence)