def wce_features(LABELS):
    '''Build the CRF feature set for the Web Content Extractor.

    For every label in LABELS the same battery of binary node features is
    registered, in this order: keywords, HTML tag counts, date and
    affiliate-link flags, punctuation counts, text length and linked rate.
    Afterwards, for every label in ``features.labels``, one label-only node
    feature, one "previous label" edge feature and one edge feature per
    label pair are added.

    Node features take ``(x, y)`` and edge features take ``(y_, y)``; each
    returns 1 when its condition holds and 0 otherwise.

    Returns the populated ``Features`` object.
    '''
    # The trailing '|' makes split() yield a final empty keyword; it is kept
    # so that the number and order of the generated features do not change.
    keywords = "Copyright|All Rights Reserved|広告掲載|会社概要|無断転載|プライバシーポリシー|利用規約|お問い合わせ|トラックバック|ニュースリリース|新着|無料|確認メール|コメントする|アソシエイト|プロフィール|カレンダー|カテゴリー|ログイン|検索|トップ|個人情報|".split('|')
    tags = "a|p|div|span|ul|ol|li|br|dl|dt|dd|table|tr|td|h1|h2|h3|h4|h5|h6|b|i|center|strong|big|small|meta|form|input|select|option|object|img|iframe|noscript".split('|')

    features = Features(LABELS)

    def observed_then_label(label, test):
        # Evaluates the observation test first and the label comparison second.
        return lambda x, y: 1 if test(x) and y == label else 0

    def label_then_observed(label, test):
        # Compares the label first, so test(x) only runs for a matching label.
        return lambda x, y: 1 if y == label and test(x) else 0

    def count_tests(count):
        # Buckets for a count: exactly zero, then strictly above 0, 1, 3, 5.
        tests = [lambda x: count(x) == 0]
        for floor in (0, 1, 3, 5):
            tests.append(lambda x, floor=floor: count(x) > floor)
        return tests

    # n_ten / n_maru are presumably the counts of Japanese commas and full
    # stops in the block -- TODO confirm against the class that provides x.
    punctuation_counts = (
        lambda x: x.n_ten,
        lambda x: x.n_maru,
        lambda x: x.n_ten + x.n_maru,
    )

    for label in LABELS:
        # keywords (a variant using re.search on x.org_text was disabled here)
        for word in keywords:
            features.add_feature(
                observed_then_label(label, lambda x, word=word: x.has(word)))

        # html tags: three thresholds on x[tag] for each tag
        for tag in tags:
            tag_tests = (
                lambda x, tag=tag: x[tag] > 0,
                lambda x, tag=tag: x[tag] < 3,
                lambda x, tag=tag: x[tag] > 5,
            )
            for test in tag_tests:
                features.add_feature(label_then_observed(label, test))

        # date & affiliate link
        features.add_feature(observed_then_label(label, lambda x: x.has_date))
        features.add_feature(observed_then_label(label, lambda x: x.affi_link))

        # punctuation: n_ten, n_maru, then their sum
        for count in punctuation_counts:
            for test in count_tests(count):
                features.add_feature(observed_then_label(label, test))

        # text length
        features.add_feature(
            observed_then_label(label, lambda x: x.len_text == 0))
        for minimum in (10, 20, 50):
            features.add_feature(observed_then_label(
                label, lambda x, minimum=minimum: x.len_text > minimum))

        # linked rate
        features.add_feature(
            observed_then_label(label, lambda x: x.linked_rate > 0.8))
        features.add_feature(
            observed_then_label(label, lambda x: x.linked_rate < 0.2))

    # label unigram / bigram features over the labels known to `features`
    for previous in features.labels:
        features.add_feature(lambda x, y, l=previous: 1 if y == l else 0)
        features.add_feature_edge(lambda y_, y, l=previous: 1 if y_ == l else 0)
        for current in features.labels:
            features.add_feature_edge(
                lambda y_, y, l1=previous, l2=current:
                    1 if y_ == l1 and y == l2 else 0)

    return features
def main():
    '''Train a toy chunking CRF with several optimisers and print progress.

    A single hard-coded training sentence (token, POS tag, chunk label) is
    turned into per-label token and POS-tag node features plus one constant
    edge feature.  Starting from the same random parameter vector, the
    log likelihood is then optimised with:

      * full-batch gradient steps ("Steepest Descent"),
      * per-sequence gradient steps ("SGD"),
      * both of the above followed by a FOBOS L1 soft-threshold step,
      * ``crf.inference`` ("BFGS").

    After every epoch the log likelihood is printed, and after every
    optimiser the elapsed time and the number of parameters whose absolute
    value exceeds 1e-5 are printed.  Every print is a single-argument call
    so the statements parse under both Python 2 and Python 3.
    '''
    def load_data(data):
        # Parse "token info chunk" lines into (texts, labels).  Blank lines
        # separate sequences; each text is a list of (token, info) tuples
        # and each label list holds the matching chunk tags.
        texts = []
        labels = []
        text = []
        label = []
        for line in data.split("\n"):
            line = line.strip()
            if not line:
                if text:
                    texts.append(text)
                    labels.append(label)
                text = []
                label = []
            else:
                token, info, chunk = line.split()
                text.append((token, info))
                label.append(chunk)
        # Flush a final sequence that is not followed by a blank line.
        if text:
            texts.append(text)
            labels.append(label)
        return (texts, labels)

    def soft_threshold(theta, cutoff):
        # FOBOS L1 step: move every weight towards zero by `cutoff` and zero
        # out the weights whose magnitude does not exceed it.
        return ((theta > cutoff) * (theta - cutoff)
                + (theta < -cutoff) * (theta + cutoff))

    def report(started, theta):
        # Elapsed time and how many weights are meaningfully non-zero.
        print("time = %.3f, relevant features = %d / %d" % (
            time.time() - started,
            (numpy.abs(theta) > 0.00001).sum(),
            theta.size))

    texts, labels = load_data("""
This DT B-NP
temblor-prone JJ I-NP
city NN I-NP
dispatched VBD B-VP
inspectors NNS B-NP
, , O
firefighters NNS B-NP
and CC O
other JJ B-NP
earthquake-trained JJ I-NP
personnel NNS I-NP
to TO B-VP
aid VB I-VP
San NNP B-NP
Francisco NNP I-NP
. . O
""")
    print("%s %s" % (texts, labels))

    test_texts, test_labels = load_data("""
Rockwell NNP B-NP
said VBD B-VP
the DT B-NP
agreement NN I-NP
calls VBZ B-VP
for IN B-SBAR
it PRP B-NP
to TO B-VP
supply VB I-VP
200 CD B-NP
additional JJ I-NP
so-called JJ I-NP
shipsets NNS I-NP
for IN B-PP
the DT B-NP
planes NNS I-NP
. . O
""")

    features = Features(labels)
    # Sorted so the order in which features are registered is deterministic.
    tokens = sorted(set(item[0] for sentence in texts for item in sentence))
    infos = sorted(set(item[1] for sentence in texts for item in sentence))

    for label in features.labels:
        for token in tokens:
            features.add_feature(
                lambda x, y, l=label, t=token: 1 if y == l and x[0] == t else 0)
        for info in infos:
            features.add_feature(
                lambda x, y, l=label, i=info: 1 if y == l and x[1] == i else 0)
    # A single edge feature that never fires.
    features.add_feature_edge(lambda y_, y: 0)

    fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)]
    # Feature vector of the unlabelled test sequence; it is built here but
    # not used by any of the optimisers below.
    text_fv = FeatureVector(features, test_texts[0])

    crf = CRF(features, 0)
    theta0 = crf.random_param()
    print("initial log likelihood: %s" % (crf.likelihood(fvs, theta0),))

    def optimise(title, eta, decay, batches, lmd=None):
        # Run 20 epochs of gradient steps on the log likelihood, starting
        # from a copy of theta0.  `batches` lists the feature-vector groups
        # that each get one step per epoch; when `lmd` is given, every step
        # is followed by a soft-threshold with cutoff lmd * eta.
        print(title)
        theta = theta0.copy()
        started = time.time()
        for epoch in range(20):
            cutoff = None if lmd is None else lmd * eta
            for batch in batches:
                theta += eta * crf.gradient_likelihood(batch, theta)
                if cutoff is not None:
                    theta = soft_threshold(theta, cutoff)
            print("%s log likelihood: %s" % (epoch, crf.likelihood(fvs, theta)))
            eta *= decay
        report(started, theta)

    full_batch = [fvs]
    one_by_one = [[fv] for fv in fvs]

    optimise(">> Steepest Descent", 0.5, 0.95, full_batch)
    optimise(">> SGD", 0.5, 0.95, one_by_one)
    optimise(">> SGD + FOBOS L1", 0.5, 0.95, one_by_one, lmd=0.01)
    optimise(">> Steepest Descent + FOBOS L1", 0.2, 0.9, full_batch, lmd=0.5)

    print(">> BFGS")
    started = time.time()
    theta = crf.inference(fvs, theta0)
    print("log likelihood: %s" % (crf.likelihood(fvs, theta),))
    report(started, theta)
def wce_features(LABELS):
    '''Build the CRF feature set for the Web Content Extractor.

    For every label in LABELS the same battery of binary node features is
    registered, in this order: keywords, HTML tag counts, date and
    affiliate-link flags, punctuation counts, text length and linked rate.
    Afterwards, for every label in ``features.labels``, one label-only node
    feature, one "previous label" edge feature and one edge feature per
    label pair are added.

    Node features take ``(x, y)`` and edge features take ``(y_, y)``; each
    returns 1 when its condition holds and 0 otherwise.

    Returns the populated ``Features`` object.
    '''
    # The trailing '|' makes split() yield a final empty keyword; it is kept
    # so that the number and order of the generated features do not change.
    keywords = "Copyright|All Rights Reserved|広告掲載|会社概要|無断転載|プライバシーポリシー|利用規約|お問い合わせ|トラックバック|ニュースリリース|新着|無料|確認メール|コメントする|アソシエイト|プロフィール|カレンダー|カテゴリー|ログイン|検索|トップ|個人情報|".split('|')
    tags = "a|p|div|span|ul|ol|li|br|dl|dt|dd|table|tr|td|h1|h2|h3|h4|h5|h6|b|i|center|strong|big|small|meta|form|input|select|option|object|img|iframe|noscript".split('|')

    features = Features(LABELS)

    def observed_then_label(label, test):
        # Evaluates the observation test first and the label comparison second.
        return lambda x, y: 1 if test(x) and y == label else 0

    def label_then_observed(label, test):
        # Compares the label first, so test(x) only runs for a matching label.
        return lambda x, y: 1 if y == label and test(x) else 0

    def count_tests(count):
        # Buckets for a count: exactly zero, then strictly above 0, 1, 3, 5.
        tests = [lambda x: count(x) == 0]
        for floor in (0, 1, 3, 5):
            tests.append(lambda x, floor=floor: count(x) > floor)
        return tests

    # n_ten / n_maru are presumably the counts of Japanese commas and full
    # stops in the block -- TODO confirm against the class that provides x.
    punctuation_counts = (
        lambda x: x.n_ten,
        lambda x: x.n_maru,
        lambda x: x.n_ten + x.n_maru,
    )

    for label in LABELS:
        # keywords (a variant using re.search on x.org_text was disabled here)
        for word in keywords:
            features.add_feature(
                observed_then_label(label, lambda x, word=word: x.has(word)))

        # html tags: three thresholds on x[tag] for each tag
        for tag in tags:
            tag_tests = (
                lambda x, tag=tag: x[tag] > 0,
                lambda x, tag=tag: x[tag] < 3,
                lambda x, tag=tag: x[tag] > 5,
            )
            for test in tag_tests:
                features.add_feature(label_then_observed(label, test))

        # date & affiliate link
        features.add_feature(observed_then_label(label, lambda x: x.has_date))
        features.add_feature(observed_then_label(label, lambda x: x.affi_link))

        # punctuation: n_ten, n_maru, then their sum
        for count in punctuation_counts:
            for test in count_tests(count):
                features.add_feature(observed_then_label(label, test))

        # text length
        features.add_feature(
            observed_then_label(label, lambda x: x.len_text == 0))
        for minimum in (10, 20, 50):
            features.add_feature(observed_then_label(
                label, lambda x, minimum=minimum: x.len_text > minimum))

        # linked rate
        features.add_feature(
            observed_then_label(label, lambda x: x.linked_rate > 0.8))
        features.add_feature(
            observed_then_label(label, lambda x: x.linked_rate < 0.2))

    # label unigram / bigram features over the labels known to `features`
    for previous in features.labels:
        features.add_feature(lambda x, y, l=previous: 1 if y == l else 0)
        features.add_feature_edge(lambda y_, y, l=previous: 1 if y_ == l else 0)
        for current in features.labels:
            features.add_feature_edge(
                lambda y_, y, l1=previous, l2=current:
                    1 if y_ == l1 and y == l2 else 0)

    return features
def pg_features(LABELS):
    '''Build the CRF feature set for the Project Gutenberg Content Extractor.

    For every label in LABELS the same battery of binary node features is
    registered, in this order: keywords, letter case, digits, line-head
    marks, line-tail marks, symbols and line count.  Afterwards, for every
    label in ``features.labels``, one label-only node feature, one
    "previous label" edge feature and one edge feature per label pair are
    added.

    Node features take ``(x, y)`` and edge features take ``(y_, y)``; each
    returns 1 when its condition holds and 0 otherwise.

    Returns the populated ``Features`` object.
    '''
    # Several entries look like regular expressions ("e-?text", "(the)?"),
    # so x.has() presumably performs a pattern search -- TODO confirm.
    keywords = "project/gutenberg/e-?text/ebook/copyright/chapter/scanner/David Reed/encoding/contents/file/zip/web/http/email/newsletter/public domain/donation/archive/ascii/produced/end of (the)? project gutenberg/PREFACE/INTRODUCTION/Language:/Release Date:/Character set/refund/LIMITED RIGHT".split('/')

    features = Features(LABELS)

    def add_node(label, test):
        # Registers a feature firing when test(x) is truthy and y is `label`.
        features.add_feature(lambda x, y: 1 if test(x) and y == label else 0)

    def has(pattern):
        # Observation test delegating to x.has(pattern).
        return lambda x: x.has(pattern)

    for label in LABELS:
        # keywords
        for word in keywords:
            add_node(label, has(word))

        # type case
        add_node(label, lambda x: x.all_upper)
        add_node(label, has(r'[A-Z]{3}'))

        # numeric: runs of one to four digits
        for pattern in (r'[0-9]', r'[0-9]{2}', r'[0-9]{3}', r'[0-9]{4}'):
            add_node(label, has(pattern))

        # line head: minimum values of x.linehead for ' ' and '*'
        for mark, minimum in ((' ', 2), (' ', 4), ('*', 1), ('*', 2), ('*', 3)):
            add_node(label, lambda x, mark=mark, minimum=minimum:
                     x.linehead[mark] >= minimum)

        # line tail
        for minimum in (1, 2, 3):
            add_node(label, lambda x, minimum=minimum:
                     x.linetail['*'] >= minimum)
        add_node(label, has(r'\n\n$'))

        # symbols
        for pattern in (r'@', r'#', r'\?', r'\['):
            add_node(label, has(pattern))

        # line number: exactly 1, 2 or 3 lines, then more than 3
        for exact in (1, 2, 3):
            add_node(label, lambda x, exact=exact: x.n_lines == exact)
        add_node(label, lambda x: x.n_lines > 3)

    # labels: unigram and bigram features over the labels known to `features`
    for previous in features.labels:
        features.add_feature(lambda x, y, l=previous: 1 if y == l else 0)
        features.add_feature_edge(lambda y_, y, l=previous: 1 if y_ == l else 0)
        for current in features.labels:
            features.add_feature_edge(
                lambda y_, y, l1=previous, l2=current:
                    1 if y_ == l1 and y == l2 else 0)

    return features
def pg_features(LABELS):
    '''Build the CRF feature set for the Project Gutenberg Content Extractor.

    For every label in LABELS the same battery of binary node features is
    registered, in this order: keywords, letter case, digits, line-head
    marks, line-tail marks, symbols and line count.  Afterwards, for every
    label in ``features.labels``, one label-only node feature, one
    "previous label" edge feature and one edge feature per label pair are
    added.

    Node features take ``(x, y)`` and edge features take ``(y_, y)``; each
    returns 1 when its condition holds and 0 otherwise.

    Returns the populated ``Features`` object.
    '''
    # Several entries look like regular expressions ("e-?text", "(the)?"),
    # so x.has() presumably performs a pattern search -- TODO confirm.
    keywords = "project/gutenberg/e-?text/ebook/copyright/chapter/scanner/David Reed/encoding/contents/file/zip/web/http/email/newsletter/public domain/donation/archive/ascii/produced/end of (the)? project gutenberg/PREFACE/INTRODUCTION/Language:/Release Date:/Character set/refund/LIMITED RIGHT".split('/')

    features = Features(LABELS)

    def add_node(label, test):
        # Registers a feature firing when test(x) is truthy and y is `label`.
        features.add_feature(lambda x, y: 1 if test(x) and y == label else 0)

    def has(pattern):
        # Observation test delegating to x.has(pattern).
        return lambda x: x.has(pattern)

    for label in LABELS:
        # keywords
        for word in keywords:
            add_node(label, has(word))

        # type case
        add_node(label, lambda x: x.all_upper)
        add_node(label, has(r'[A-Z]{3}'))

        # numeric: runs of one to four digits
        for pattern in (r'[0-9]', r'[0-9]{2}', r'[0-9]{3}', r'[0-9]{4}'):
            add_node(label, has(pattern))

        # line head: minimum values of x.linehead for ' ' and '*'
        for mark, minimum in ((' ', 2), (' ', 4), ('*', 1), ('*', 2), ('*', 3)):
            add_node(label, lambda x, mark=mark, minimum=minimum:
                     x.linehead[mark] >= minimum)

        # line tail
        for minimum in (1, 2, 3):
            add_node(label, lambda x, minimum=minimum:
                     x.linetail['*'] >= minimum)
        add_node(label, has(r'\n\n$'))

        # symbols
        for pattern in (r'@', r'#', r'\?', r'\['):
            add_node(label, has(pattern))

        # line number: exactly 1, 2 or 3 lines, then more than 3
        for exact in (1, 2, 3):
            add_node(label, lambda x, exact=exact: x.n_lines == exact)
        add_node(label, lambda x: x.n_lines > 3)

    # labels: unigram and bigram features over the labels known to `features`
    for previous in features.labels:
        features.add_feature(lambda x, y, l=previous: 1 if y == l else 0)
        features.add_feature_edge(lambda y_, y, l=previous: 1 if y_ == l else 0)
        for current in features.labels:
            features.add_feature_edge(
                lambda y_, y, l1=previous, l2=current:
                    1 if y_ == l1 and y == l2 else 0)

    return features
def main():
    '''Train a toy chunking CRF with several optimisers and print progress.

    A single hard-coded training sentence (token, POS tag, chunk label) is
    turned into per-label token and POS-tag node features plus one constant
    edge feature.  Starting from the same random parameter vector, the
    log likelihood is then optimised with:

      * full-batch gradient steps ("Steepest Descent"),
      * per-sequence gradient steps ("SGD"),
      * both of the above followed by a FOBOS L1 soft-threshold step,
      * ``crf.inference`` ("BFGS").

    After every epoch the log likelihood is printed, and after every
    optimiser the elapsed time and the number of parameters whose absolute
    value exceeds 1e-5 are printed.  Every print is a single-argument call
    so the statements parse under both Python 2 and Python 3.
    '''
    def load_data(data):
        # Parse "token info chunk" lines into (texts, labels).  Blank lines
        # separate sequences; each text is a list of (token, info) tuples
        # and each label list holds the matching chunk tags.
        texts = []
        labels = []
        text = []
        label = []
        for line in data.split("\n"):
            line = line.strip()
            if not line:
                if text:
                    texts.append(text)
                    labels.append(label)
                text = []
                label = []
            else:
                token, info, chunk = line.split()
                text.append((token, info))
                label.append(chunk)
        # Flush a final sequence that is not followed by a blank line.
        if text:
            texts.append(text)
            labels.append(label)
        return (texts, labels)

    def soft_threshold(theta, cutoff):
        # FOBOS L1 step: move every weight towards zero by `cutoff` and zero
        # out the weights whose magnitude does not exceed it.
        return ((theta > cutoff) * (theta - cutoff)
                + (theta < -cutoff) * (theta + cutoff))

    def report(started, theta):
        # Elapsed time and how many weights are meaningfully non-zero.
        print("time = %.3f, relevant features = %d / %d" % (
            time.time() - started,
            (numpy.abs(theta) > 0.00001).sum(),
            theta.size))

    texts, labels = load_data("""
This DT B-NP
temblor-prone JJ I-NP
city NN I-NP
dispatched VBD B-VP
inspectors NNS B-NP
, , O
firefighters NNS B-NP
and CC O
other JJ B-NP
earthquake-trained JJ I-NP
personnel NNS I-NP
to TO B-VP
aid VB I-VP
San NNP B-NP
Francisco NNP I-NP
. . O
""")
    print("%s %s" % (texts, labels))

    test_texts, test_labels = load_data("""
Rockwell NNP B-NP
said VBD B-VP
the DT B-NP
agreement NN I-NP
calls VBZ B-VP
for IN B-SBAR
it PRP B-NP
to TO B-VP
supply VB I-VP
200 CD B-NP
additional JJ I-NP
so-called JJ I-NP
shipsets NNS I-NP
for IN B-PP
the DT B-NP
planes NNS I-NP
. . O
""")

    features = Features(labels)
    # Sorted so the order in which features are registered is deterministic.
    tokens = sorted(set(item[0] for sentence in texts for item in sentence))
    infos = sorted(set(item[1] for sentence in texts for item in sentence))

    for label in features.labels:
        for token in tokens:
            features.add_feature(
                lambda x, y, l=label, t=token: 1 if y == l and x[0] == t else 0)
        for info in infos:
            features.add_feature(
                lambda x, y, l=label, i=info: 1 if y == l and x[1] == i else 0)
    # A single edge feature that never fires.
    features.add_feature_edge(lambda y_, y: 0)

    fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)]
    # Feature vector of the unlabelled test sequence; it is built here but
    # not used by any of the optimisers below.
    text_fv = FeatureVector(features, test_texts[0])

    crf = CRF(features, 0)
    theta0 = crf.random_param()
    print("initial log likelihood: %s" % (crf.likelihood(fvs, theta0),))

    def optimise(title, eta, decay, batches, lmd=None):
        # Run 20 epochs of gradient steps on the log likelihood, starting
        # from a copy of theta0.  `batches` lists the feature-vector groups
        # that each get one step per epoch; when `lmd` is given, every step
        # is followed by a soft-threshold with cutoff lmd * eta.
        print(title)
        theta = theta0.copy()
        started = time.time()
        for epoch in range(20):
            cutoff = None if lmd is None else lmd * eta
            for batch in batches:
                theta += eta * crf.gradient_likelihood(batch, theta)
                if cutoff is not None:
                    theta = soft_threshold(theta, cutoff)
            print("%s log likelihood: %s" % (epoch, crf.likelihood(fvs, theta)))
            eta *= decay
        report(started, theta)

    full_batch = [fvs]
    one_by_one = [[fv] for fv in fvs]

    optimise(">> Steepest Descent", 0.5, 0.95, full_batch)
    optimise(">> SGD", 0.5, 0.95, one_by_one)
    optimise(">> SGD + FOBOS L1", 0.5, 0.95, one_by_one, lmd=0.01)
    optimise(">> Steepest Descent + FOBOS L1", 0.2, 0.9, full_batch, lmd=0.5)

    print(">> BFGS")
    started = time.time()
    theta = crf.inference(fvs, theta0)
    print("log likelihood: %s" % (crf.likelihood(fvs, theta),))
    report(started, theta)