def main():
    import getopt, sys

    def usage():
        print '''usage: extract.py [-d)ebug] [-S)trict] [-t pat_threshold]
    [-T diffscore_threshold] [-M mainscore_threshold] [-c default_charset]
    [-C codec_out] [-a accept_pat] [-j reject_pat] [-P mangle_pat]
    patfile zipfile ...'''
        sys.exit(2)

    try:
        (opts, args) = getopt.getopt(sys.argv[1:], 'dSt:T:M:c:C:a:j:P:')
    except getopt.GetoptError:
        usage()
    # default settings
    (debug, pat_threshold, diffscore_threshold, mainscore_threshold, default_charset, codec_out, strict) = \
            (0, 0.8, 0.5, 50, 'iso-8859-1', 'utf-8', False)
    acldb = None
    mangle_pat = None
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-S': strict = True
        elif k == '-t': pat_threshold = float(v)
        elif k == '-T': diffscore_threshold = float(v)
        elif k == '-M': mainscore_threshold = float(v)
        elif k == '-c': default_charset = v
        elif k == '-C': codec_out = v
        elif k == '-a':
            if not acldb: acldb = ACLDB()
            acldb.add_allow(v)
        elif k == '-j':
            if not acldb: acldb = ACLDB()
            acldb.add_deny(v)
        elif k == '-P':
            mangle_pat = v
    if not args:
        usage()
    # Load the layout pattern file produced by analyze.py.
    patternset = LayoutPatternSet(debug=debug)
    fp = open(args[0])
    patternset.read(fp)
    fp.close()
    if mangle_pat:
        patternset.set_encoder(mangle_pat)
    del args[0]
    consumer = TextExtractor(patternset,
                             pat_threshold,
                             diffscore_threshold,
                             mainscore_threshold,
                             default_charset=default_charset,
                             codec_out=codec_out,
                             strict=strict,
                             debug=debug)
    if not args:
        args = ['-']
    # Each remaining argument is a crawled .zip archive, '-' for stdin,
    # or a plain HTML file.
    for fname in args:
        if fname.endswith('.zip'):
            ZipLoader(consumer, fname, acldb=acldb, debug=debug).run()
        elif fname == '-':
            consumer.feed_page('stdin', sys.stdin)
        else:
            fp = open(fname)
            consumer.feed_page(fname, fp)
            fp.close()
    return
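For orientation, a minimal programmatic sketch of the pipeline this main()
wires up, using the same defaults as above. LayoutPatternSet, TextExtractor
and ZipLoader come from the surrounding module; the file names here are
placeholders, not part of the original source.

# Sketch only: mirrors the defaults set in main() above.
patternset = LayoutPatternSet(debug=0)
fp = open('site.pat')               # hypothetical pattern file from analyze.py
patternset.read(fp)
fp.close()
consumer = TextExtractor(patternset, 0.8, 0.5, 50,
                         default_charset='iso-8859-1',
                         codec_out='utf-8', strict=False, debug=0)
ZipLoader(consumer, 'pages.zip', acldb=None, debug=0).run()   # hypothetical archive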
Example #2
def main():
  import getopt, re, sys, time
  from urlparse import urljoin    # Python 2 stdlib
  def usage():
    print '''usage: textcrawler.py -o outfile [-d] [-b baseid] [-a accept_pat] [-j reject_pat]
    [-i index_html] [-m level] [-k cookie_file] [-c default_charset]
    [-U urldb] [-D delay] [-T timeout] [-L linkinfo] [url ...]'''
    sys.exit(2)
  try:
    # note: '-C' is accepted here but never handled in the option loop below.
    (opts, args) = getopt.getopt(sys.argv[1:], 'db:a:j:i:m:k:c:C:U:o:D:T:L:')
  except getopt.GetoptError:
    usage()
  # default settings
  (debug, maxlevel, cookie_file, delay) = (0, 1, None, 0)
  (index_html, default_charset, urldb, timeout) = ('', 'iso-8859-1', None, 300)
  (baseid, outfile, linkinfo) = (None, None, 'linkinfo')
  reftxtdb = None
  acldb = None
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-b': baseid = v
    elif k == '-a':
      if not acldb: acldb = ACLDB()
      acldb.add_allow(v)
    elif k == '-j':
      if not acldb: acldb = ACLDB()
      acldb.add_deny(v)
    elif k == '-m': maxlevel = int(v)
    elif k == '-i': index_html = v
    elif k == '-k': cookie_file = v
    elif k == '-c': default_charset = v
    elif k == '-U': urldb = URLDB(v)
    elif k == '-D': delay = int(v)
    elif k == '-o': outfile = v
    elif k == '-T': timeout = int(v)
    elif k == '-L': linkinfo = v
  if not args:
    usage()
  if not baseid:
    baseid = time.strftime('%Y%m%d%H%M')
  if not acldb:
    # No explicit rules: deny binary-looking files and allow only pages
    # under each start URL's directory.
    acldb = ACLDB()
    acldb.add_deny(r'\.(jpg|jpeg|gif|png|tiff|swf|mov|wmv|wma|ram|rm|rpm|gz|zip|class)\b')
    for starturl in args:
      acldb.add_allow('^'+re.escape(urljoin(starturl, '.')))
  if linkinfo:
    reftxtdb = RefTextDB(baseid)
  if outfile:
    dumper = ZipDumper(outfile, baseid)
  else:
    dumper = NullDumper()               # crawling only
  for starturl in args:
    try:
      TextCrawler(dumper, starturl, baseid, reftxtdb=reftxtdb,
                  index_html=index_html, maxlevel=maxlevel, 
                  cookie_file=cookie_file, default_charset=default_charset,
                  acldb=acldb, urldb=urldb, delay=delay, timeout=timeout,
                  debug=debug).run()
    except CrawlerFatalError:
      pass
  if linkinfo:
    dumper.feed_page(linkinfo, reftxtdb.dump())
  dumper.close()
  return
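When no -a/-j option is given, the crawler builds a default ACL: deny
binary-looking extensions, then allow only URLs under each start URL's
directory. A standalone sketch of that allow rule (the URL is a placeholder):

# Sketch of the default allow rule built in main() above.
import re
from urlparse import urljoin                      # Python 2 stdlib

starturl = 'http://example.com/news/index.html'   # hypothetical start URL
prefix = urljoin(starturl, '.')                   # 'http://example.com/news/'
allow_pat = '^' + re.escape(prefix)
assert re.match(allow_pat, 'http://example.com/news/today.html')
assert not re.match(allow_pat, 'http://example.com/sports/')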
Example #3
def main():
    import getopt, sys

    def usage():
        print(
            'usage: analyze.py [-d] [-t cluster_threshold] [-T title_threshold]'
            ' [-S score_threshold] [-m max_sample] [-L linkinfo] [-c default_charset]'
            ' [-a accept_pat] [-j reject_pat] [-P mangle_pat] files ...')
        sys.exit(2)

    try:
        (opts, args) = getopt.getopt(sys.argv[1:], 'dt:T:S:m:L:c:a:j:P:')
    except getopt.GetoptError:
        usage()
    # default settings
    debug = 0
    cluster_threshold = 0.97
    title_threshold = 0.6
    score_threshold = 100
    max_sample = 0
    default_charset = 'utf-8'
    acldb = None
    mangle_pat = None
    linkinfo = 'linkinfo'
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-t': cluster_threshold = float(v)
        elif k == '-T': title_threshold = float(v)
        elif k == '-S': score_threshold = float(v)
        elif k == '-m': max_sample = int(v)
        elif k == '-L': linkinfo = ''   # note: '-L' clears linkinfo, disabling link-text output
        elif k == '-c': default_charset = v
        elif k == '-a':
            if not acldb: acldb = ACLDB()
            acldb.add_allow(v)
        elif k == '-j':
            if not acldb: acldb = ACLDB()
            acldb.add_deny(v)
        elif k == '-P':
            mangle_pat = v
    if not args:
        usage()
    #
    analyzer = LayoutAnalyzer(debug=debug)
    if mangle_pat:
        analyzer.set_encoder(mangle_pat)
    print '### version=%s' % WEBSTEMMER_VERSION
    for fname in args:
        print '### fname=%r' % fname
        feeder = PageFeeder(analyzer,
                            linkinfo=linkinfo,
                            acldb=acldb,
                            default_charset=default_charset,
                            debug=debug)
        if fname.endswith('.zip'):
            ZipLoader(feeder, fname, debug=debug).run()
        elif fname.endswith('.list') or fname == '-':
            if fname == '-':
                fp = sys.stdin
            else:
                fp = open(fname)
            for line in fp:
                name = line.strip()
                if debug:
                    print >>sys.stderr, 'Loading: %r' % name
                fp2 = open(name)
                data = fp2.read()
                fp2.close()
                feeder.feed_page(name, data)
            fp.close()
        else:
            fp = open(fname)
            data = fp.read()
            fp.close()
            feeder.feed_page(fname, data)
        feeder.close()
    print '### cluster_threshold=%f' % cluster_threshold
    print '### title_threshold=%f' % title_threshold
    print '### pages=%d' % len(analyzer.pages)
    print
    if mangle_pat:
        print '!mangle_pat=%r' % mangle_pat
        print
    for c in analyzer.analyze(cluster_threshold, title_threshold, max_sample):
        if c.pattern and score_threshold <= c.score:
            c.dump()
    return
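The same analysis step in isolation: feed crawled pages to a LayoutAnalyzer
through a PageFeeder, then dump every cluster whose score clears the
threshold. Defaults mirror main() above; the archive name is a placeholder.

# Sketch only: programmatic equivalent of main() above.
analyzer = LayoutAnalyzer(debug=0)
feeder = PageFeeder(analyzer, linkinfo='linkinfo', acldb=None,
                    default_charset='utf-8', debug=0)
ZipLoader(feeder, 'site.200401.zip', debug=0).run()   # hypothetical crawl archive
feeder.close()
for c in analyzer.analyze(0.97, 0.6, 0):   # cluster_threshold, title_threshold, max_sample
    if c.pattern and 100 <= c.score:
        c.dump()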