def main():
    """Main entry-point for command-line usage."""
    options, args = create_option_parser().parse_args()

    if options.ver:
        from __version__ import print_version_info
        print_version_info('clean_corpus.py')
        exit(0)

    # Collect the input files: plain files are taken as-is, directories are
    # scanned for .txt files.
    files = []
    for f in args:
        if os.path.exists(f):
            if os.path.isdir(f):
                for fn in os.listdir(f):
                    if fn.endswith('.txt') and not os.path.isdir(fn):
                        files.append(os.path.join(f, fn))
            else:
                files.append(f)

    if not files:
        print 'No input files specified.'
        exit(1)

    # Load the optional bad-word and good-word lists used by process_file().
    global badwords
    global goodwords
    badwords = options.badfile and set(
        [w.lower() for w in open(options.badfile).read().split()]) or set()
    goodwords = options.goodfile and set(
        [w.lower() for w in open(options.goodfile).read().split()]) or set()

    allwords = []
    dictfiles = get_file_dict(files)
    dictfiles.sort(key=lambda d: d['filename'])
    for f in dictfiles:
        allwords.extend(
            process_file(open(f['filename']),
                         remove_bad=options.removebad,
                         output_list=options.list))

    # In list mode, print the unique words in sorted order.
    if options.list:
        allwords = list(set(allwords))
        allwords.sort()
        for word in allwords:
            print word
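# The helpers called above (get_file_dict() and process_file()) are defined
# elsewhere in clean_corpus.py. The following is only an illustrative sketch,
# inferred from the call sites in main(), of what that code assumes about
# them; the dict shape and the exact filtering rules are assumptions, not the
# script's actual implementation.

def get_file_dict(files):
    # main() only relies on each entry having a 'filename' key it can sort on
    # and open, so the simplest compatible shape is one dict per input path.
    return [{'filename': f} for f in files]


badwords = set()
goodwords = set()


def process_file(f, remove_bad=False, output_list=False):
    # Sketch of the filtering main() sets up via the badwords/goodwords
    # globals: optionally drop words that appear in badwords (unless they are
    # whitelisted in goodwords) and return the surviving words so main() can
    # aggregate them into one list.
    words = []
    for word in f.read().split():
        w = word.lower()
        if remove_bad and w in badwords and w not in goodwords:
            continue
        words.append(w)
    return words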
def main(): """Main entry-point for command-line usage.""" options, args = create_option_parser().parse_args() if options.ver: from __version__ import print_version_info print_version_info('corpus_collect.py') exit(0) outputdir = options.outputdir l = options.tuplelength n = options.numelements q = options.quiet u = options.urls e = options.pattern f = os.sys.stdin skipTuples = False # You can't really skip this step without having other input to work with. skipUrls = options.skipurls skipDownload = options.skipdownloads skipConvert = options.skipconvert if options.tuplefile: skipTuples = True if options.urlfile: skipTuples = skipUrls = True if options.pagedir: skipTuples = skipUrls = skipDownload = True #print 'Skipping options: %s %s %s' % (skipTuples, skipUrls, skipDownload) # Step 1: Get search tuples if skipTuples: searchtuples = () if not skipUrls and not skipDownload: tuplefile = open(options.tuplefile) searchtuples = tuple([ line[:-1] for line in tuplefile.readlines() ]) else: if args: if os.path.exists(args[0]): f = open(args[0]) else: print usage exit(1) if not q: print _('Reading seeds from %s') % (f.name) seeds = f.read().split() searchtuples = build_random_tuples(seeds, n=n, l=l, outdir=outputdir, talkative=not q) # Step 2: Collect URLs from search results of searching for search tuples if skipUrls: urls = () if not skipDownload: urlfile = open(options.urlfile) urls = tuple([ line[:-1] for line in urlfile.readlines() ]) else: urls = get_urls(searchtuples, count=u, outdir=outputdir, talkative=not q) # Step 3: Download the URLs to the local machine if skipDownload: if not os.path.isdir(options.pagedir): raise ValueError(_('Not a valid directory: %s') % (options.pagedir)) localpages = [] def htmlfilter(arg, dir, files): files[:] = [f for f in files if f.endswith('html')] for f in files: localpages.append( os.path.join(dir, f) ) os.path.walk(options.pagedir, htmlfilter, None) localpages = tuple(localpages) else: localpages = download_urls( urls, outdir=outputdir, crawldepth=options.crawldepth, siteonly=options.siteonly, talkative=not q, pattern=options.pattern ) # Step 4: Convert downloaded pages to text if not skipConvert: texts = convert_pages(localpages, outdir=outputdir, talkative=not q)
def main(): """Main entry-point for command-line usage.""" options, args = create_option_parser().parse_args() if options.ver: from __version__ import print_version_info print_version_info('corpus_collect.py') exit(0) outputdir = options.outputdir l = options.tuplelength n = options.numelements q = options.quiet u = options.urls e = options.pattern f = os.sys.stdin skipTuples = False # You can't really skip this step without having other input to work with. skipUrls = options.skipurls skipDownload = options.skipdownloads skipConvert = options.skipconvert if options.tuplefile: skipTuples = True if options.urlfile: skipTuples = skipUrls = True if options.pagedir: skipTuples = skipUrls = skipDownload = True #print 'Skipping options: %s %s %s' % (skipTuples, skipUrls, skipDownload) # Step 1: Get search tuples if skipTuples: searchtuples = () if not skipUrls and not skipDownload: tuplefile = open(options.tuplefile) searchtuples = tuple([line[:-1] for line in tuplefile.readlines()]) else: if args: if os.path.exists(args[0]): f = open(args[0]) else: print usage exit(1) if not q: print _('Reading seeds from %s') % (f.name) seeds = f.read().split() searchtuples = build_random_tuples(seeds, n=n, l=l, outdir=outputdir, talkative=not q) # Step 2: Collect URLs from search results of searching for search tuples if skipUrls: urls = () if not skipDownload: urlfile = open(options.urlfile) urls = tuple([line[:-1] for line in urlfile.readlines()]) else: urls = get_urls(searchtuples, count=u, outdir=outputdir, talkative=not q) # Step 3: Download the URLs to the local machine if skipDownload: if not os.path.isdir(options.pagedir): raise ValueError( _('Not a valid directory: %s') % (options.pagedir)) localpages = [] def htmlfilter(arg, dir, files): files[:] = [f for f in files if f.endswith('html')] for f in files: localpages.append(os.path.join(dir, f)) os.path.walk(options.pagedir, htmlfilter, None) localpages = tuple(localpages) else: localpages = download_urls(urls, outdir=outputdir, crawldepth=options.crawldepth, siteonly=options.siteonly, talkative=not q, pattern=options.pattern) # Step 4: Convert downloaded pages to text if not skipConvert: texts = convert_pages(localpages, outdir=outputdir, talkative=not q)