def main(**kwargs):
    limit = kwargs.pop('limit')
    cat_mems = get_category("ArticleHistory", limit)
    page_ids = [c.pageid for c in cat_mems if c.ns == 1]
    concurrency = kwargs.pop('concurrency')
    chunk_size = kwargs.pop('grouping')
    pages = []
    histories = []
    am = ProgressMeter(total=len(page_ids), unit="articles", ticks=30)
    for cpages in chunked_pimap(get_articles, page_ids, parsed=False,
                                concurrency=concurrency, chunk_size=chunk_size):
        for p in cpages:
            am.update(1)
            pages.append(p)
            try:
                ah = ArticleHistory.from_page(p)
            except ValueError as ve:
                print ve
                continue
            histories.append(ah)
    sorted_histories = sorted(histories, key=lambda x: x.last_updated)
    print len(sorted_histories), "histories retrieved and parsed, but didn't do much else"
    import pdb; pdb.set_trace()
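# A minimal sketch of a ProgressMeter compatible with the call sites in this
# file (constructor kwargs `total`, `unit`, `ticks`; an `update(n)` method that
# advances the meter by n; a readable `total` attribute). This is an assumption
# for illustration -- the real ProgressMeter class is defined elsewhere and may
# render its output differently.
import sys

class ProgressMeter(object):
    def __init__(self, total, unit="items", ticks=30):
        self.total = total
        self.unit = unit
        self.ticks = ticks
        self.count = 0

    def update(self, n):
        # advance by n and redraw a simple fixed-width text bar
        self.count = min(self.count + n, self.total)
        filled = int(self.ticks * self.count / float(max(self.total, 1)))
        bar = '#' * filled + '-' * (self.ticks - filled)
        sys.stdout.write('\r[%s] %d/%d %s' % (bar, self.count, self.total, self.unit))
        sys.stdout.flush()
        if self.count >= self.total:
            sys.stdout.write('\n')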
def build_links(self, db):
    """
    Analyze the original pages and rebuild the link relationships.
    """
    print "Building links' connections."
    conn = sqlite3.connect(db)
    cur = conn.cursor()
    conn.text_factory = str
    dbname = db[:-3]
    sql = "select url from %s" % dbname
    urls = [url[0] for url in cur.execute(sql).fetchall()]
    urlids = self.urls2ids(urls)
    from_urls = dict([(urlid, []) for urlid in urlids])
    to_urls = dict([(urlid, []) for urlid in urlids])
    progress = ProgressMeter(total=len(urls))
    for (cnt, url) in enumerate(urls):
        urlid = self.get_urlid(url)
        p = MyHTMLParser(url)
        sql = "select content from %s where url='%s'" % (dbname, url)
        content = cur.execute(sql).fetchone()[0]
        try:
            p.feed(content)
        except Exception:
            ferrmsg('Error: feed error in %s.' % url, 'Rank')
        to_urls[urlid] = self.urls2ids(p.htm_urls())
        for lid in to_urls[urlid]:
            if lid not in from_urls:
                continue
            from_urls[lid].append(urlid)
        # update the progress
        if (cnt % REFRESH_CNT) == 0 or cnt == progress.total - 1:
            progress.update(cnt + 1)
    self.url_ids = urlids
    self.from_ids = from_urls
    self.to_ids = to_urls
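# Side note (not from the original module): sqlite3 supports qmark parameter
# substitution for values, which avoids the quoting problems of the
# string-formatted queries above. Table names still have to be interpolated,
# so `table` is assumed to come from trusted code (above it is derived from the
# database filename). `fetch_content` is a hypothetical helper for illustration.
import sqlite3

def fetch_content(db, table, url):
    conn = sqlite3.connect(db)
    conn.text_factory = str
    cur = conn.cursor()
    row = cur.execute("select content from %s where url=?" % table, (url,)).fetchone()
    cur.close()
    conn.close()
    return row[0] if row else None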
def getVarAndTimeJG(sims, start, end):
    latS = 28.
    latN = 80.
    # latS = -34.
    # latN = -34.
    var = []
    time = []
    ii = 0
    for sim in sims:
        dataPath = '/glade/scratch/jfyke/%s/run' % (sim)
        meter = ProgressMeter(total=end[ii] + 1 - start[ii])
        for yr in range(start[ii], end[ii] + 1):
            year = '%04i' % yr
            Files = sorted(
                glob.glob('%s/%s.pop.h.%s-??.nc' % (dataPath, sim, year)))
            dd = dataServer(Files)
            var.append(dd.extractTimeSeries(latS=latS, latN=latN))
            meter.update(1)
        ii += 1
    var = squeeze(ma.array(var).filled().astype("f"))
    # var = ma.array(var).filled().astype("f")
    time = arange(len(var))
    print var
    return var, time
def read_measurement(self, folder):
    """
    This function contains the logic to read in measurement folders.
    First, it finds out what it has to do and creates a list of instructions.
    Then it starts to process those jobs in parallel.

    :param folder: The path to the folder that contains the measurement.
    :type folder: str.
    """
    avg_folders = []
    for filename in os.listdir(folder):
        if re.match(self.avg_folder_match, filename):
            avg_folders.append(filename)
    xmlfiles = []
    for avg_folder in avg_folders:
        for dirname, dirnames, filenames in os.walk(os.path.join(folder, avg_folder)):
            for filename in filenames:
                xmlmatch = re.match(self.xml_match, filename)
                if xmlmatch:
                    vals = [int(val) for val in xmlmatch.groups()]
                    date = datetime(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6] * 10000)
                    image_file, bg_file = Measurement.other_files_for_xml(filename)
                    if not os.path.isfile(os.path.join(dirname, image_file)):
                        raise NameError('No TIFF image found for XML %s.' % filename)
                    if not os.path.isfile(os.path.join(dirname, bg_file)):
                        raise NameError('No background TIFF image found for XML %s.' % filename)
                    xmlfiles.append({'date': date, 'd': dirname, 'f': filename,
                                     'avg': re.match(self.avg_folder_match, avg_folder).groups()[0]})
    if xmlfiles == []:
        raise NameError("This folder doesn't seem to contain measurement data")
    total = len(xmlfiles)
    pm = ProgressMeter(total=total)

    ## Parallel processing of the files
    finished = False
    num_processes = cpu_count()
    i = 0
    p = Pool(processes=num_processes)
    manager = Manager()
    queue = manager.Queue()
    result = p.map_async(process_MeasurementPoint_QueueWrapper,
                         [(xmlfile, queue) for xmlfile in xmlfiles])
    while not finished:
        if not queue.empty():
            #print("Processed XML file %s." % queue.get())
            queue.get()
            i += 1
            if i == total:
                finished = True
            if i % num_processes == 0:
                pm.update(num_processes)
        else:
            time.sleep(0.02)
    if i % num_processes != 0:
        pm.update(i % num_processes)
    self.measurementPoints = result.get()

    ## Sequential processing of the files
    #self.measurementPoints = []
    #for xmlfile in xmlfiles:
    #    self.measurementPoints.append(process_MeasurementPoint(xmlfile))
    #    pm.update(1)

    self.after_process()
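# One possible shape for the queue wrapper used by read_measurement() above --
# an assumption, since process_MeasurementPoint_QueueWrapper is defined
# elsewhere. The wrapper takes a single (xmlfile, queue) tuple so map_async can
# pass both, reports completion through the queue (the parent only counts these
# messages for the progress meter), and returns the parsed point for result.get().
def process_MeasurementPoint_QueueWrapper(args):
    xmlfile, queue = args
    point = process_MeasurementPoint(xmlfile)  # the real parser, defined in the module
    queue.put(xmlfile['f'])                    # signal "one more file done"
    return point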
def getMonth(self, Year=1958, Month=1, Daily=None, TimeZoneOffset=0):
    # Return 1 month of data.
    # Keep iterating over day of month until exception
    # occurs, marking end of month
    print 'Getting %s %s %s' % (self.Field, Year, Month)
    Ndays = self.getNdaysInMonth(Year, Month)
    meter = ProgressMeter(total=Ndays)
    f = []
    for Day in range(1, Ndays + 1):
        meter.update(1)
        x = self.getDay(Year, Month, Day, Daily, TimeZoneOffset)
        if Daily is None:
            f.extend(x.tolist())
        else:
            f.append(x)
    return array(f)
def getMonth(self, Year=1958, Month=1, Daily=None, TimeZoneOffset=0):
    # Return 1 month of data.
    # Keep iterating over day of month until exception
    # occurs, marking end of month
    print 'Getting %s %s %s' % (self.Field, Year, Month)
    Ndays = self.getNdaysInMonth(Year, Month)
    meter = ProgressMeter(total=Ndays)
    f = []
    for Day in range(1, Ndays + 1):
        meter.update(1)
        x = self.getDay(Year, Month, Day, Daily, TimeZoneOffset)
        if Daily is None:
            f.extend(x.tolist())
        else:
            f.append(x)
    return np.array(f)
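# A plausible implementation of the getNdaysInMonth() helper used by both
# getMonth() variants above (an assumption -- the real method lives on the data
# server class and may handle model calendars differently):
import calendar

def getNdaysInMonth(Year, Month):
    # calendar.monthrange returns (weekday of the 1st, number of days in month)
    return calendar.monthrange(Year, Month)[1]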
def getTimeSlice(self, DateStart=(1958, 1, 1, 0), DateEnd=(1958, 12, 31, 18)):
    print 'DataServer: Getting timeslice %s to %s' % (DateStart, DateEnd)
    h0 = self.getHours(*DateStart)
    h1 = self.getHours(*DateEnd)
    N = int((h1 - h0) / 6 + 1)
    f = self.snapshot(*self.getDate(h0))
    shape = (N,) + f.shape
    if hasattr(f, 'mask'):
        f = ma.zeros(shape, dtype=float)
    else:
        f = zeros(shape, dtype=float)
    meter = ProgressMeter(total=N)
    for l in range(N):
        meter.update(1)
        f[l] = self.snapshot(*self.getDate(h0))
        h0 += 6
    return f
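# Worked example of the snapshot count used in getTimeSlice() above: the data
# are 6-hourly, so from hour 0 to hour 42 there are (42 - 0)/6 + 1 = 8
# snapshots (hours 0, 6, 12, 18, 24, 30, 36, 42).
h0, h1 = 0, 42
N = int((h1 - h0) / 6 + 1)
assert N == 8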
def index(self, db):
    """
    Index the given database. The indexing steps are:
    1. split the content into individual words;
    2. record each word;
    3. calculate each word's term frequency in the current page.
    Note: indexing is time-consuming.
    """
    conn = sqlite3.connect(db)
    cur = conn.cursor()
    conn.text_factory = str
    dbname = db[:-3]
    sql = "select url from %s" % dbname
    urls = [url[0] for url in cur.execute(sql).fetchall()]
    progress = ProgressMeter(total=len(urls))
    # traverse all webpages
    for (cnt, url) in enumerate(urls):
        urlid = self.getid('urllist', 'url', url)
        sql = "select content from %s where url='%s'" % (dbname, url)
        html = cur.execute(sql).fetchone()[0]
        items = self.getitems(html)
        title = replace_quote(items['title'])
        sql = "insert into urltitle values(%d,'%s')" % (urlid, title)
        self.cur.execute(sql)
        content = items['content']
        words = self.analyzer.run(content)
        tfdir = {}
        # traverse all words in the current webpage
        for i in range(len(words)):
            word = words[i]
            if word not in tfdir:
                tfdir[word] = 1
            else:
                tfdir[word] += 1
            wordid = self.getid('wordlist', 'word', word)
            sql = "insert into wordlocation values(%d,%d,%d)" % (urlid, wordid, i)
            self.cur.execute(sql)
        for (word, tf) in tfdir.items():
            wordid = self.getid('wordlist', 'word', word)
            sql = "insert into wordinfo values(%d,%d,%f)" % \
                (urlid, wordid, float(tf) / len(words))
            self.cur.execute(sql)
        # update the progress
        if (cnt % REFRESH_CNT) == 0 or cnt == progress.total - 1:
            progress.update(cnt + 1)
    del progress
    cur.close()
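# Tiny worked example of the term-frequency values written to `wordinfo`
# above: tf(word) = count(word) / len(words) for the current page.
words = ['spam', 'egg', 'spam', 'ham']
counts = {}
for w in words:
    counts[w] = counts.get(w, 0) + 1
tf = dict((w, float(c) / len(words)) for (w, c) in counts.items())
assert tf == {'spam': 0.5, 'egg': 0.25, 'ham': 0.25}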
def getTimeSlice(self, DateStart=(1958, 1, 1, 0), DateEnd=(1958, 12, 31, 18)):
    print ' -- Getting timeslice %s to %s' % (DateStart, DateEnd)
    h0 = self.getHours(*DateStart)
    h1 = self.getHours(*DateEnd)
    N = int((h1 - h0) / 6 + 1)
    f = self.snapshot(*self.getDate(h0))
    shape = (N,) + f.shape
    if hasattr(f, 'mask'):
        f = ma.zeros(shape, dtype=float)
    else:
        f = zeros(shape, dtype=float)
    meter = ProgressMeter(total=N)
    for l in range(N):
        meter.update(1)
        f[l] = self.snapshot(*self.getDate(h0))
        h0 += 6
    return f
def __call__(self, field, lon, lat):
    """
    Input: field on a regular lat-lon grid.
    Output: field on the regular projection grid.
    """
    if len(field.shape) == 2:
        field, lon = basemap.addcyclic(field, lon)
        field, lon = basemap.shiftgrid(180, field, lon, start=False)
        self.field = self.m.transform_scalar(field, lon, lat, self.nx, self.ny)
    elif len(field.shape) == 3:
        n = field.shape[0]
        self.field = np.zeros((n, self.ny, self.nx), dtype='f')
        for l in range(n):
            field1, lon1 = basemap.addcyclic(field[l], lon)
            field1, lon1 = basemap.shiftgrid(180, field1, lon1, start=False)
            self.field[l] = self.m.transform_scalar(
                field1, lon1, lat, self.nx, self.ny)
    elif len(field.shape) == 4:
        n0 = field.shape[0]
        n1 = field.shape[1]
        if hasattr(field, 'mask'):
            self.field = np.ma.zeros((n0, n1, self.ny, self.nx), dtype=float)
        else:
            self.field = np.zeros((n0, n1, self.ny, self.nx), dtype=float)
        print 'LambertProjector: Projecting 4D field'
        m = ProgressMeter(total=n0 * n1)
        for l0 in range(n0):
            for l1 in range(n1):
                field1, lon1 = basemap.addcyclic(field[l0, l1], lon)
                field1, lon1 = basemap.shiftgrid(180, field1, lon1, start=False)
                self.field[l0, l1] = self.m.transform_scalar(
                    field1, lon1, lat, self.nx, self.ny)
                m.update(1)
    return self.field
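# Small illustration of the addcyclic/shiftgrid preprocessing used by the
# projector above (requires mpl_toolkits.basemap; the 2.5-degree grid here is
# just an example): addcyclic appends the wrap-around longitude column, and
# shiftgrid(180, ..., start=False) rotates the grid so longitudes run from
# -180 to 180 before transform_scalar interpolates onto the projection grid.
import numpy as np
from mpl_toolkits import basemap

lon = np.arange(0, 360, 2.5)
lat = np.arange(-90, 90.1, 2.5)
field = np.random.rand(lat.size, lon.size)

field_c, lon_c = basemap.addcyclic(field, lon)
field_s, lon_s = basemap.shiftgrid(180, field_c, lon_c, start=False)
assert field_s.shape == (lat.size, lon.size + 1)
assert lon_s[0] == -180 and lon_s[-1] == 180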
def get_category_recursive(cat_name, count=None):
    ret = set()
    seen_cats = set()
    if count is None:
        count = ALL
        print 'Recursively getting all members of', cat_name
    else:
        print 'Recursively getting', count, 'members of', cat_name
    jobs = []
    api_pool = Pool(CAT_CONC)
    jobs.append(api_pool.spawn(get_category, cat_name, count))
    dpm = ProgressMeter(total=count, unit="categories", ticks=30)
    while len(ret) < count and jobs:
        cur_count = count - len(ret)
        api_pool.join(timeout=0.3, raise_error=True)
        # iterate over a snapshot so removing finished jobs doesn't skip entries
        for j in list(jobs):
            if not j.ready():
                continue
            jobs.remove(j)
            if not j.successful():
                print 'failed a cat fetch'
                continue
            cur_mems = j.value
            for m in cur_mems:
                if m.ns == 14:
                    if m.title not in seen_cats:
                        jobs.append(api_pool.spawn(get_category, m.title, cur_count))
                        seen_cats.add(m.title)
                else:
                    ret.add(m)
                    dpm.update(1)
    dpm.update(count - len(ret))
    ret = list(ret)[:count]
    print 'Done, returning', len(ret), 'category members.'
    return list(ret)
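# Minimal illustration of the gevent Pool pattern used above: spawn greenlets,
# join with a short timeout, then harvest whichever jobs are ready. `fetch` is
# a hypothetical stand-in for get_category; the real code above also re-spawns
# jobs for subcategories as it discovers them.
import gevent
from gevent.pool import Pool

def fetch(name):
    gevent.sleep(0.1)
    return [name + '-member']

pool = Pool(4)
jobs = [pool.spawn(fetch, n) for n in ('a', 'b', 'c')]
results = []
while jobs:
    pool.join(timeout=0.3)
    for j in [j for j in jobs if j.ready()]:
        jobs.remove(j)
        if j.successful():
            results.extend(j.value)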
def Seasonal(Field='U', Season='DJF', Source='ERA40',
             YearStart=None, YearStop=None):
    # instantiate data server
    data = DataServer(Field=Field, Source=Source)
    if YearStart is None:
        YearStart = data.FirstYear
    if YearStop is None:
        YearStop = data.LastYear
    assert YearStart >= data.FirstYear, \
        '\nFirst year in dataset is %s' % data.FirstYear
    assert YearStop <= data.LastYear, \
        '\nLast year in dataset is %s' % data.LastYear
    # create output file
    FileName = '%s.%s.%s.%s-%s.nc' % (Field, Season, Source, YearStart, YearStop)
    File = CreateOutputFile(FileName, data)
    print 'Creating %s' % FileName
    TimeIndex = 0
    meter = ProgressMeter(total=YearStop - YearStart + 1)
    for Year in range(YearStart, YearStop + 1):
        meter.update(1)
        # get 1 season of data
        SeasonData = data.getSeason(Year, Season)
        File.variables['time'][TimeIndex] = float(Year)
        File.variables[Field][TimeIndex] = SeasonData.astype('f')
        TimeIndex += 1
    File.close()
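# One possible shape for the CreateOutputFile() helper used by Seasonal()
# above -- an assumption for illustration, written against netCDF4. The real
# helper, its NetCDF library, and the data server's coordinate attributes
# (assumed here to be data.lat and data.lon) may all differ; the seasonal slab
# is assumed to be 2D (lat, lon).
from netCDF4 import Dataset

def CreateOutputFile(FileName, data):
    File = Dataset(FileName, 'w')
    File.createDimension('time', None)   # unlimited record dimension
    File.createDimension('lat', len(data.lat))
    File.createDimension('lon', len(data.lon))
    File.createVariable('time', 'f', ('time',))
    File.createVariable(data.Field, 'f', ('time', 'lat', 'lon'))
    return File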
def save_a_bunch(count=DEFAULT_LIMIT, category=DEFAULT_CAT, concurrency=DEFAULT_CONC,
                 per_call=DEFAULT_PER_CALL, db_name=DEFAULT_DB):
    import time
    page_ids = get_dab_page_ids(category, count)
    dabblets = []
    dpm = ProgressMeter(total=len(page_ids), unit="articles", ticks=30)
    for pages in chunked_pimap(get_articles, page_ids,
                               concurrency=concurrency, chunk_size=per_call):
        for p in pages:
            dpm.update(1)
            cur_dabs = get_dabblets(p)
            dabblets.extend(cur_dabs)
    print
    print 'Saving', len(dabblets), 'dabblets.'
    dspm = ProgressMeter(total=len(dabblets), unit="dabblets", ticks=30)
    dsave_start = time.time()
    for d in dabblets:
        d.save()
        for img in d.source_imgs:
            dab_img = DabImage(dabblet=d, src=img)
            dab_img.save()
        dspm.update(1)
    print
    print 'Done saving', len(dabblets), 'Dabblets. (', time.time() - dsave_start, 'seconds)'

    print 'Processing choices for', len(dabblets), 'Dabblets.'
    cpm = ProgressMeter(total=len(page_ids), unit="Dabblets", ticks=30)
    all_choices = []
    for choices in chunked_pimap(get_dab_choices, dabblets,
                                 concurrency=concurrency, chunk_size=per_call):
        cpm.update(per_call)
        all_choices.extend(choices)
    print
    print 'Saving', len(all_choices), 'DabChoices.'
    cspm = ProgressMeter(total=len(all_choices), unit="DabChoices", ticks=30)
    csave_start = time.time()
    for c in all_choices:
        c.save()
        cspm.update(1)
    print 'Done saving', len(all_choices), 'DabChoices. (', time.time() - csave_start, 'seconds)'

    drank_start = time.time()
    print 'Ranking', len(dabblets), 'Dabblets.'
    for d in dabblets:
        d.priority = d.get_priority()
        d.save()
    print 'Done ranking', len(dabblets), 'Dabblets. (', time.time() - drank_start, 'seconds)'

    print len(set([d.title for d in dabblets])), 'unique titles'
    print len(set([d.source_title for d in dabblets])), 'unique source pages'
    print len(all_choices), 'dabblet choices fetched and saved.'
    print Dabblet.select().count(), 'total records in database'
    print len(set([d.title for d in Dabblet.select()])), 'unique titles in database'
    print 'Committing...'
    return dabblets