def active_features(self, verbose=1):
    # XXX: We probably don't want to do this here. However, we *do* run it
    # here so that the active set is guaranteed to be the right size.
    if self.group_budget is not None:
        self.dense.prox_budget(self.group_budget)

    active = [c for c in self.C if self.dense.w[self.context_feature_id(c)] != 0]

    #assert len(active) == np.sum(w != 0), 'active %s, nonzero %s' % (len(active), np.sum(w != 0))
    #self.check_L0_group_norm_proxy(self.dense)

    if verbose:
        print('%s: %s out of %s' % (colors.yellow % 'active', len(active), len(self.C)), end=' ')
        B = groupby2(active, len)
        print('(budget %s, sizes %s)' % (self.group_budget,
                                         ', '.join('%s: %s' % (z, len(B[z])) for z in sorted(B))))

    return active
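# The functions in this excerpt rely on a `groupby2` helper that is not
# defined here. The sketch below is an assumption inferred from the call
# sites (`groupby2(xs, len)` and `groupby2(xs, key=...)` returning a dict
# mapping each key value to the list of items with that key); the project's
# real helper may differ.
from collections import defaultdict

def groupby2(xs, key):
    "Group `xs` into a dict mapping key(x) -> list of items with that key (assumed behavior)."
    groups = defaultdict(list)
    for x in xs:
        groups[key(x)].append(x)
    return dict(groups)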
def add_features(self):
    items = self.pages[0].items

    # above/below abstract
    abstracts = [x for x in items if x.abstract]
    if len(abstracts) == 1:
        [abstract] = abstracts
        for x in items:
            x.attributes['above-abstract'] = x.yoffset < abstract.yoffset
    else:
        # TODO: handle no abstracts or many abstracts
        pass

    # extract local features
    for page in self.pages:
        for x in page.items:
            feature_extraction(x)

    # fontsize frequency
    fontsize = Counter(x.fontsize for x in items)
    freq = zip(fontsize.values(), fontsize.keys())
    freq.sort(reverse=True)
    rank = {k: rank + 1 for rank, (v, k) in enumerate(freq)}
    for x in items:
        x.attributes['fontsize-freq-rank'] = rank[x.fontsize]

    # width frequency
    w = Counter(int(x.width) for x in items)
    freq = zip(w.values(), w.keys())
    freq.sort(reverse=True)
    rank = {k: rank + 1 for rank, (v, k) in enumerate(freq)}
    for x in items:
        x.attributes['width-rank'] = rank[int(x.width)]

    # fontsize rank
    fontsize = groupby2(items, lambda x: x.fontsize)
    for rank, (_, vs) in enumerate(reversed(sorted(fontsize.items()))):
        for v in vs:
            v.attributes['fontsize-size-rank'] = rank + 1
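# Minimal illustration (not part of the extractor) of the two ranking idioms
# used in add_features: under the *frequency* ranks, rank 1 is the most
# frequent value; under the *size* rank, rank 1 is the largest value. The
# sample data below is made up for the example.
from collections import Counter

sizes = [10, 10, 10, 12, 24, 12, 10]

# frequency rank: 10 appears most often, so it gets rank 1
c = Counter(sizes)
freq = sorted(zip(c.values(), c.keys()), reverse=True)
freq_rank = {k: r + 1 for r, (v, k) in enumerate(freq)}
assert freq_rank == {10: 1, 12: 2, 24: 3}

# size rank: 24 is the largest, so it gets rank 1
size_rank = {k: r + 1 for r, k in enumerate(sorted(set(sizes), reverse=True))}
assert size_rank == {24: 1, 12: 2, 10: 3}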
def extract_title(filename, extra=True):
    EXPERIMENTAL_AUTHOR_EXTRACTION = 1
    if EXPERIMENTAL_AUTHOR_EXTRACTION:
        A = authors_set()

    if not isinstance(filename, basestring):
        pdf = filename
        filename = pdf.filename
    else:
        filename = re.sub('^file://', '', filename)
        try:
            pdf = pdfminer(filename)
        except KeyboardInterrupt:
            raise
        except:
            return

    # check for skid-mark
    # if os.path.exists(filename + '.d/notes.org'):
    #     from skid.add import Document
    #     d = Document(filename)
    #     meta = d.parse_notes()
    #     print meta.get(u'title', None)
    #     print meta.get(u'author', None)

    if not pdf:
        return

    page = pdf.pages[0].items

    # Preprocessing: keep only items containing a three-or-more-letter word
    # beginning with a capital letter; only those are candidates for the
    # author or title.
    page = [x for x in page
            if re.findall('[A-Z][A-Za-z][A-Za-z]+', x.text)]

    # Capitalization filter: titles (almost) always have at least one
    # capitalized three-letter word.
    #
    # - TODO: discards multiline titles where the second line doesn't have any
    #   capitalized words.
    #
    # - TODO: other observations to exploit: titles tend not to contain single
    #   initials, unlike author names; both the title and the authors precede
    #   the word "abstract".

    g = groupby2(page, key=lambda x: x.fontsize)
    if not g:
        return

    title = ' '.join(x.text for x in g[max(g)])

    # Clean up case if all caps.
    if title.isupper():
        title = title.title()

    print yellow % title.encode('utf8')

    if extra:
        # timv: this is sort of a proxy for author extraction. If it's easy to
        # copy-paste the authors, maybe we don't need automatic extraction.
        #
        # - authors often appear in a distinguishing (infrequent) font.
        #
        # - text of the document should be in the most frequent font (although
        #   sometimes the authors aren't in a distinguished font).
        g = groupby2(page, key=lambda x: x.fontname)
        freq = [(len(v), k, v) for k, v in g.iteritems()]
        freq.sort()
        for count, key, items in freq:
            print
            print red % count, green % key
            for x in items[:15]:
                x = x.text.encode('utf8')
                if EXPERIMENTAL_AUTHOR_EXTRACTION:
                    # similarity to existing list of authors
                    aa = [(sim(a, simplify(x), n=3), a) for a in A]
                    aa = [(s, a) for s, a in aa if s > 0.2]
                    aa.sort(reverse=1)
                    print yellow % x, ('%s %s' % (red % '->', aa[:5])) if aa else ''
                else:
                    print yellow % x

        extract_year(freq)

    return title
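# The author-matching step above calls simplify() and sim(), which are not
# defined in this excerpt. The sketch below is only a guess at their contract,
# based on the call sim(a, simplify(x), n=3) and the 0.2 threshold: simplify()
# normalizes a string, and sim() returns a character n-gram overlap score in
# [0, 1]. The real implementations may differ.
import re

def simplify(s):
    "Lowercase and strip everything but letters and spaces (assumed behavior)."
    return re.sub(r'[^a-z ]+', '', s.lower()).strip()

def sim(a, b, n=3):
    "Dice coefficient over character n-grams (assumed behavior)."
    A = set(a[i:i + n] for i in range(len(a) - n + 1))
    B = set(b[i:i + n] for i in range(len(b) - n + 1))
    if not A or not B:
        return 0.0
    return 2.0 * len(A & B) / (len(A) + len(B))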
def _main(args):
    with timeit('load data'):
        corpus = CoNLL_U('data/UD/{lang}/UD_{lang}'.format(lang=args.lang),
                         tag_type=args.tag_type)
        if args.quick:
            corpus.train = corpus.train[:100]
            corpus.dev = corpus.train[:0]

    allowed_contexts = None
    if args.context_count is not None:
        print 'context count filter threshold %s' % args.context_count

        max_order = args.initial_order + args.outer_iterations
        if args.max_order is not None:
            max_order = args.max_order

        allowed_contexts = contexts_by_count(corpus, max_order, args.context_count)
        print 'allowed_contexts:', len(allowed_contexts)

        B = groupby2(allowed_contexts, len)
        print '(sizes %s)' % (', '.join('%s: %s' % (z, len(B[z])) for z in sorted(B)))

        if 0:
            # things that survived the threshold.
            for k, v in B.items():
                if k >= 10:    # context size >= 10
                    print
                    print k
                    for vv in v:
                        print '-'.join(vv)
            pl.plot(B.keys(), map(len, B.values()))
            pl.show()

        if 0:
            max_order = args.outer_iterations
            C = {}
            for n in xrange(1, max_order + 1):    # initial order + num iters
                C.update(corpus.tag_ngram_counts(n=n))
            pl.scatter(map(len, C.keys()), C.values(), lw=0, alpha=0.5)
            pl.show()

    elif args.max_order is not None:
        allowed_contexts = prefix_closure(
            fixed_order_contexts(corpus.Y, order=args.max_order))
        print 'allowed_contexts:', len(allowed_contexts)

    A = ActiveSet(corpus,
                  Y=corpus.Y,
                  train=corpus.make_instances('train', Instance),
                  dev=corpus.make_instances('dev', Instance),
                  group_budget=args.budget,
                  regularizer=args.C,
                  outer_iterations=args.outer_iterations,
                  inner_iterations=args.inner_iterations,
                  initial_contexts=fixed_order_contexts(corpus.Y, args.initial_order),
                  allowed_contexts=allowed_contexts,
                  no_failure_arcs=args.baseline,
                  dump=args.dump)

    A.active_set()
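# ActiveSet, fixed_order_contexts, prefix_closure, and contexts_by_count come
# from the surrounding project and are not defined in this excerpt. As a rough
# sketch of the shapes involved (an assumption, not the project's code):
# contexts are tuples of tags, fixed_order_contexts enumerates all tag tuples
# of a given length, and prefix_closure adds every nonempty prefix of each
# context so the set is closed under truncation.
from itertools import product

def fixed_order_contexts(Y, order):
    "All tag tuples of length `order` over tag set Y (assumed behavior)."
    return set(product(Y, repeat=order))

def prefix_closure(contexts):
    "Close a set of tuple contexts under taking nonempty prefixes (assumed behavior)."
    closed = set()
    for c in contexts:
        for i in range(1, len(c) + 1):
            closed.add(c[:i])
    return closed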
def extract_title(filename, extra=True):
    if not isinstance(filename, basestring):
        pdf = filename
        filename = pdf.filename
    else:
        try:
            pdf = pdfminer(filename)
        except KeyboardInterrupt:
            raise
        except:
            return

    # check for skid-mark
    # if os.path.exists(filename + '.d/notes.org'):
    #     from skid.add import Document
    #     d = Document(filename)
    #     meta = d.parse_notes()
    #     print meta.get(u'title', None)
    #     print meta.get(u'author', None)

    if not pdf:
        return

    page = pdf.pages[0].items

    # Preprocessing: keep only items containing a three-or-more-letter word
    # beginning with a capital letter; only those are candidates for the
    # author or title.
    page = [x for x in page
            if re.findall('[A-Z][A-Za-z][A-Za-z]+', x.text)]

    # TODO: titles tend not to contain single initials, unlike author names;
    # both the title and the authors precede the word "abstract".

    g = groupby2(page, key=lambda x: x.fontsize)
    if not g:
        return

    title = ' '.join(x.text for x in g[max(g)])

    # Clean up case if all caps.
    if title.isupper():
        title = title.title()

    print yellow % title.encode('utf8')

    if extra:
        # timv: this is sort of a proxy for author extraction. If it's easy to
        # copy-paste the authors, maybe we don't need automatic extraction.
        #
        # Authors often appear in a distinguishing (infrequent) font.
        g = groupby2(page, key=lambda x: x.fontname)
        freq = [(len(v), k, v) for k, v in g.iteritems()]
        freq.sort()
        for count, key, items in freq:
            print
            print red % count, green % key
            for x in items[:10]:
                print yellow % x.text.encode('utf8')

    return title