Example 1
def main():
    options, args = create_option_parser().parse_args()

    if options.ver:
        from __version__ import print_version_info
        print_version_info('clean_corpus.py')
        exit(0)

    files = []
    for f in args:
        if os.path.exists(f):
            if os.path.isdir(f):
                for fn in os.listdir(f):
                    # Join with the parent so the isdir check looks at the right path.
                    if fn.endswith('.txt') and not os.path.isdir(os.path.join(f, fn)):
                        files.append(os.path.join(f, fn))
            else:
                files.append(f)

    if not files:
        print 'No input files specified.'
        exit(1)

    global badwords
    global goodwords

    badwords = options.badfile and set(
        [w.lower() for w in open(options.badfile).read().split()]) or set()
    goodwords = options.goodfile and set(
        [w.lower() for w in open(options.goodfile).read().split()]) or set()
    allwords = []

    dictfiles = get_file_dict(files)
    dictfiles.sort(key=lambda d: d['filename'])
    for f in dictfiles:
        allwords.extend(
            process_file(open(f['filename']),
                         remove_bad=options.removebad,
                         output_list=options.list))

    if options.list:
        allwords = list(set(allwords))
        allwords.sort()

    for word in allwords:
        print word
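
The badwords/goodwords assignments above use the old "x and y or z" idiom, which predates Python's conditional expression. A minimal sketch of the same logic written with conditional expressions and set comprehensions, assuming the same options object as in the snippet:

# Equivalent of the `and ... or` pattern above: build the set only when a
# word list file was given, otherwise fall back to an empty set.
badwords = ({w.lower() for w in open(options.badfile).read().split()}
            if options.badfile else set())
goodwords = ({w.lower() for w in open(options.goodfile).read().split()}
             if options.goodfile else set())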
Example 2
def main():
    options, args = create_option_parser().parse_args()

    if options.ver:
        from __version__ import print_version_info
        print_version_info('clean_corpus.py')
        exit(0)

    files = []
    for f in args:
        if os.path.exists(f):
            if os.path.isdir(f):
                for fn in os.listdir(f):
                    # Join with the parent so the isdir check looks at the right path.
                    if fn.endswith('.txt') and not os.path.isdir(os.path.join(f, fn)):
                        files.append(os.path.join(f, fn))
            else:
                files.append(f)

    if not files:
        print 'No input files specified.'
        exit(1)

    global badwords
    global goodwords

    badwords  = options.badfile  and set([w.lower() for w in open(options.badfile).read().split() ]) or set()
    goodwords = options.goodfile and set([w.lower() for w in open(options.goodfile).read().split()]) or set()
    allwords  = []

    dictfiles = get_file_dict(files)
    dictfiles.sort(key=lambda d: d['filename'])
    for f in dictfiles:
        allwords.extend( process_file(open(f['filename']), remove_bad=options.removebad, output_list=options.list) )

    if options.list:
        allwords = list(set(allwords))
        allwords.sort()

    for word in allwords:
        print word
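
As a side note, the .txt collection step in both examples can also be written with pathlib. A minimal sketch, assuming the same args list of paths:

from pathlib import Path

files = []
for arg in map(Path, args):
    if arg.is_dir():
        # Pick up plain-text files directly inside the directory, skipping
        # any subdirectory whose name happens to end in .txt.
        files.extend(str(p) for p in arg.iterdir()
                     if p.suffix == '.txt' and not p.is_dir())
    elif arg.exists():
        files.append(str(arg))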
Example 3
def main():
    """Main entry-point for command-line usage."""
    options, args = create_option_parser().parse_args()

    if options.ver:
        from __version__ import print_version_info
        print_version_info('corpus_collect.py')
        exit(0)

    outputdir = options.outputdir
    l = options.tuplelength
    n = options.numelements
    q = options.quiet
    u = options.urls
    e = options.pattern

    f = os.sys.stdin

    skipTuples   = False # You can't really skip this step without having other input to work with.
    skipUrls     = options.skipurls
    skipDownload = options.skipdownloads
    skipConvert  = options.skipconvert

    if options.tuplefile:
        skipTuples = True
    if options.urlfile:
        skipTuples = skipUrls = True
    if options.pagedir:
        skipTuples = skipUrls = skipDownload = True

    #print 'Skipping options: %s %s %s' % (skipTuples, skipUrls, skipDownload)

    # Step 1: Get search tuples
    if skipTuples:
        searchtuples = ()
        if not skipUrls and not skipDownload:
            tuplefile = open(options.tuplefile)
            searchtuples = tuple([ line[:-1] for line in tuplefile.readlines() ])
    else:
        if args:
            if os.path.exists(args[0]):
                f = open(args[0])
            else:
                print usage
                exit(1)

        if not q:
            print _('Reading seeds from %s') % (f.name)
        seeds = f.read().split()

        searchtuples = build_random_tuples(seeds, n=n, l=l, outdir=outputdir, talkative=not q)

    # Step 2: Collect URLs from search results of searching for search tuples
    if skipUrls:
        urls = ()
        if not skipDownload:
            urlfile = open(options.urlfile)
            urls = tuple([ line[:-1] for line in urlfile.readlines() ])
    else:
        urls = get_urls(searchtuples, count=u, outdir=outputdir, talkative=not q)

    # Step 3: Download the URLs to the local machine
    if skipDownload:
        if not os.path.isdir(options.pagedir):
            raise ValueError(_('Not a valid directory: %s') % (options.pagedir))

        localpages = []

        def htmlfilter(arg, dir, files):
            files[:] = [f for f in files if f.endswith('html')]
            for f in files:
                localpages.append( os.path.join(dir, f) )

        os.path.walk(options.pagedir, htmlfilter, None)
        localpages = tuple(localpages)
    else:
        localpages = download_urls(
            urls,
            outdir=outputdir,
            crawldepth=options.crawldepth,
            siteonly=options.siteonly,
            talkative=not q,
            pattern=options.pattern
        )

    # Step 4: Convert downloaded pages to text
    if not skipConvert:
        texts = convert_pages(localpages, outdir=outputdir, talkative=not q)
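
os.path.walk, used in Step 3 when options.pagedir is set, exists only in Python 2. A minimal sketch of the same HTML-collection step with os.walk, assuming the same options.pagedir value (note that unlike the htmlfilter callback above, which prunes subdirectories by rewriting files[:], this walks the whole tree):

import os

localpages = []
for dirpath, dirnames, filenames in os.walk(options.pagedir):
    # Mirror the htmlfilter callback: keep only the downloaded HTML pages.
    localpages.extend(os.path.join(dirpath, name)
                      for name in filenames if name.endswith('html'))
localpages = tuple(localpages)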
Example 4
def main():
    """Main entry-point for command-line usage."""
    options, args = create_option_parser().parse_args()

    if options.ver:
        from __version__ import print_version_info
        print_version_info('corpus_collect.py')
        exit(0)

    outputdir = options.outputdir
    l = options.tuplelength
    n = options.numelements
    q = options.quiet
    u = options.urls
    e = options.pattern

    f = os.sys.stdin

    skipTuples = False  # You can't really skip this step without having other input to work with.
    skipUrls = options.skipurls
    skipDownload = options.skipdownloads
    skipConvert = options.skipconvert

    if options.tuplefile:
        skipTuples = True
    if options.urlfile:
        skipTuples = skipUrls = True
    if options.pagedir:
        skipTuples = skipUrls = skipDownload = True

    #print 'Skipping options: %s %s %s' % (skipTuples, skipUrls, skipDownload)

    # Step 1: Get search tuples
    if skipTuples:
        searchtuples = ()
        if not skipUrls and not skipDownload:
            tuplefile = open(options.tuplefile)
            searchtuples = tuple([line[:-1] for line in tuplefile.readlines()])
    else:
        if args:
            if os.path.exists(args[0]):
                f = open(args[0])
            else:
                print usage
                exit(1)

        if not q:
            print _('Reading seeds from %s') % (f.name)
        seeds = f.read().split()

        searchtuples = build_random_tuples(seeds,
                                           n=n,
                                           l=l,
                                           outdir=outputdir,
                                           talkative=not q)

    # Step 2: Collect URLs from search results of searching for search tuples
    if skipUrls:
        urls = ()
        if not skipDownload:
            urlfile = open(options.urlfile)
            urls = tuple([line[:-1] for line in urlfile.readlines()])
    else:
        urls = get_urls(searchtuples,
                        count=u,
                        outdir=outputdir,
                        talkative=not q)

    # Step 3: Download the URLs to the local machine
    if skipDownload:
        if not os.path.isdir(options.pagedir):
            raise ValueError(
                _('Not a valid directory: %s') % (options.pagedir))

        localpages = []

        def htmlfilter(arg, dir, files):
            files[:] = [f for f in files if f.endswith('html')]
            for f in files:
                localpages.append(os.path.join(dir, f))

        os.path.walk(options.pagedir, htmlfilter, None)
        localpages = tuple(localpages)
    else:
        localpages = download_urls(urls,
                                   outdir=outputdir,
                                   crawldepth=options.crawldepth,
                                   siteonly=options.siteonly,
                                   talkative=not q,
                                   pattern=options.pattern)

    # Step 4: Convert downloaded pages to text
    if not skipConvert:
        texts = convert_pages(localpages, outdir=outputdir, talkative=not q)
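
Both versions read the tuple and URL list files with line[:-1], which strips the last character of every line and so truncates the final entry when the file lacks a trailing newline. A minimal sketch of a more defensive read, assuming one URL per line in options.urlfile:

with open(options.urlfile) as urlfile:
    # rstrip('\n') drops the newline when present but leaves the final line
    # intact when the file does not end with one; blank lines are skipped.
    urls = tuple(line.rstrip('\n') for line in urlfile if line.strip())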