Example #1
def main(**kwargs):
    limit = kwargs.pop('limit')
    cat_mems = get_category("ArticleHistory", limit)
    page_ids = [c.pageid for c in cat_mems if c.ns == 1]
    concurrency = kwargs.pop('concurrency')
    chunk_size = kwargs.pop('grouping')
    pages = []
    histories = []
    am = ProgressMeter(total=len(page_ids), unit="articles", ticks=30)
    for cpages in chunked_pimap(get_articles,
                                page_ids,
                                parsed=False,
                                concurrency=concurrency,
                                chunk_size=chunk_size):
        for p in cpages:
            am.update(1)
            pages.append(p)
            try:
                ah = ArticleHistory.from_page(p)
            except ValueError as ve:
                print ve
                continue
            histories.append(ah)
    # sort once, after all chunks have been processed
    sorted_histories = sorted(histories, key=lambda x: x.last_updated)
    print len(sorted_histories), "histories retrieved and parsed, but didn't do much else"
    import pdb;pdb.set_trace()
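All of these examples assume a ProgressMeter with roughly the same interface: a constructor taking total (plus, in some projects, unit and ticks) and an update(n) method. The actual implementations vary between the projects; a minimal sketch of such a meter, for reference only:

import sys

class ProgressMeter(object):
    """Minimal sketch: count completed units and redraw a tick bar."""
    def __init__(self, total, unit="items", ticks=30):
        self.total = total
        self.unit = unit
        self.ticks = ticks
        self.count = 0

    def update(self, n=1):
        self.count += n
        done = int(self.ticks * self.count / float(self.total))
        bar = '#' * done + '-' * (self.ticks - done)
        sys.stderr.write('\r[%s] %d/%d %s' % (bar, self.count, self.total, self.unit))
        sys.stderr.flush()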
Example #2
	def build_links(self, db):
		""" Analyze the original page, and rebulid the link-relationship. """
		print "Building links' connections."
		conn = sqlite3.connect(db)
		cur  = conn.cursor()
		conn.text_factory = str
		dbname = db[:-3]
		sql  = "select url from %s" % dbname
		urls = [ url[0] for url in cur.execute(sql).fetchall()]
		
		urlids    = self.urls2ids(urls)
		from_urls = {urlid: [] for urlid in urlids}
		to_urls   = {urlid: [] for urlid in urlids}

		progress = ProgressMeter(total=len(urls))
		for (cnt, url) in enumerate(urls):
			urlid = self.get_urlid(url)
			p = MyHTMLParser(url)
			sql = "select content from %s where url='%s'" % (dbname, url)
			content = cur.execute(sql).fetchone()[0]
			try:
				p.feed(content)
			except Exception:
				ferrmsg('Error: feed error in %s.' % url, 'Rank')
			to_urls[urlid] = self.urls2ids(p.htm_urls())
			for lid in to_urls[urlid]:
				if lid in from_urls:
					from_urls[lid].append(urlid)
			# update the progress
			if (cnt % REFRESH_CNT) == 0 or cnt == progress.total-1:
				progress.update(cnt+1)
		self.url_ids  = urlids
		self.from_ids = from_urls
		self.to_ids   = to_urls
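The loop above builds two adjacency maps: to_urls[urlid] lists the pages a URL links to, and from_urls[urlid] lists the pages linking back to it. A toy illustration of the same inversion step, with hypothetical URL ids and no database:

# Invert an outbound-link map into an inbound-link map.
to_urls = {1: [2, 3], 2: [3], 3: [1]}           # hypothetical url ids
from_urls = {urlid: [] for urlid in to_urls}
for src, targets in to_urls.items():
    for dst in targets:
        if dst in from_urls:                    # ignore links to unknown pages
            from_urls[dst].append(src)
# from_urls is now {1: [3], 2: [1], 3: [1, 2]}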
Example #3
def getVarAndTimeJG(sims, start, end):
    latS = 28.
    latN = 80.

    #    latS = -34.
    #    latN = -34.

    var = []
    for ii, sim in enumerate(sims):
        dataPath = '/glade/scratch/jfyke/%s/run' % (sim)

        meter = ProgressMeter(total=end[ii] + 1 - start[ii])
        for yr in range(start[ii], end[ii] + 1):
            year = '%04i' % yr

            Files = sorted(
                glob.glob('%s/%s.pop.h.%s-??.nc' % (dataPath, sim, year)))

            dd = dataServer(Files)
            var.append(dd.extractTimeSeries(latS=latS, latN=latN))
            meter.update(1)

    var = squeeze(ma.array(var).filled().astype("f"))
    #    var = ma.array(var).filled().astype("f")
    time = arange(len(var))

    print var

    return var, time
Example #4
    def read_measurement(self, folder):
        """
        This function contains the logic to read in measurement folders.
        First, it finds out what it has to do and creates a list of instructions.
        Then it starts to process those jobs in parallel.

        :param folder: The path to the folder that contains the measurement.
        :type folder: str.

        """
        avg_folders = []
        for filename in os.listdir(folder):
            if re.match(self.avg_folder_match, filename):
                avg_folders.append(filename)
        xmlfiles = []
        for avg_folder in avg_folders:
            for dirname, dirnames, filenames in os.walk(os.path.join(folder,avg_folder)):
                for filename in filenames:
                    xmlmatch = re.match(self.xml_match, filename)
                    if xmlmatch:
                        vals = [int(val) for val in xmlmatch.groups()]
                        date = datetime(vals[0],vals[1],vals[2],vals[3],vals[4],vals[5],vals[6]*10000)
                        image_file, bg_file = Measurement.other_files_for_xml(filename)
                        if not os.path.isfile(os.path.join(dirname, image_file)):
                            raise NameError('No TIFF image found for XML %s.' % filename)
                        if not os.path.isfile(os.path.join(dirname, bg_file)):
                            raise NameError('No background TIFF image found for XML %s.' % filename)
                        xmlfiles.append({'date': date, 'd': dirname, 'f': filename, 'avg': re.match(self.avg_folder_match, avg_folder).groups()[0]})
        if not xmlfiles:
            raise NameError("This folder doesn't seem to contain measurement data")
        total = len(xmlfiles)
        pm = ProgressMeter(total=total)

        ## Parallel processing of the files
        finished = False
        num_processes = cpu_count()
        i = 0
        p = Pool(processes=num_processes)
        manager = Manager()
        queue = manager.Queue()
        result = p.map_async(process_MeasurementPoint_QueueWrapper, [(xmlfile, queue) for xmlfile in xmlfiles])
        while not finished:
            if not queue.empty():
                #print("Processed XML file %s." % queue.get())
                queue.get()
                i += 1
                if i == total: finished = True
                if i % num_processes == 0: pm.update(num_processes)
            else:
                time.sleep(0.02)
        if i % num_processes != 0: pm.update(i % num_processes)
        self.measurementPoints = result.get()
        ## Sequential processing of the files
        #self.measurementPoints = []
        #for xmlfile in xmlfiles:
        #    self.measurementPoints.append(process_MeasurementPoint(xmlfile))
        #    pm.update(1)

        self.after_process()
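The parallel section above follows a map_async-plus-queue pattern: each worker pushes a token onto a shared Manager queue as it finishes, and the parent drains the queue to drive the progress meter. A stripped-down sketch of that pattern (the worker body is a placeholder); using a blocking queue.get() also avoids the sleep-polling loop above:

from multiprocessing import Pool, Manager, cpu_count

def _worker(args):
    job, queue = args
    result = job * job            # placeholder for the real per-file work
    queue.put(job)                # signal completion to the parent
    return result

if __name__ == '__main__':
    jobs = list(range(100))
    manager = Manager()
    queue = manager.Queue()       # manager queues can be shared with pool workers
    pool = Pool(processes=cpu_count())
    async_result = pool.map_async(_worker, [(j, queue) for j in jobs])
    for done in range(len(jobs)):
        queue.get()               # blocks until a worker reports in
        # a ProgressMeter.update(1) call would go here
    results = async_result.get()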
Example #5
 def getMonth(self, Year=1958, Month=1, Daily=None, TimeZoneOffset=0):
     # Return 1 month of data, iterating over each day of the month.
     print 'Getting %s %s %s' % (self.Field, Year, Month)
     Ndays = self.getNdaysInMonth(Year,Month)
     meter = ProgressMeter(total=Ndays)        
     f = []
     for Day in range(1,Ndays+1):
         meter.update(1)
         x = self.getDay(Year,Month,Day,Daily,TimeZoneOffset)
         if Daily is None: f.extend( x.tolist() )
         else: f.append(x)
     return array(f)
Example #6
 def getMonth(self, Year=1958, Month=1, Daily=None, TimeZoneOffset=0):
     # Return 1 month of data, iterating over each day of the month.
     print 'Getting %s %s %s' % (self.Field, Year, Month)
     Ndays = self.getNdaysInMonth(Year, Month)
     meter = ProgressMeter(total=Ndays)
     f = []
     for Day in range(1, Ndays + 1):
         meter.update(1)
         x = self.getDay(Year, Month, Day, Daily, TimeZoneOffset)
         if Daily is None: f.extend(x.tolist())
         else: f.append(x)
     return np.array(f)
Example #7
 def getTimeSlice(self, DateStart=(1958,1,1,0), DateEnd=(1958,12,31,18)):
     print 'DataServer: Getting timeslice %s to %s' % (DateStart,DateEnd)
     h0 = self.getHours(*DateStart)
     h1 = self.getHours(*DateEnd)        
     N = int((h1-h0)/6+1)
     f = self.snapshot(*self.getDate(h0))
     shape = (N,) + f.shape
     if hasattr(f,'mask'):
         f = ma.zeros(shape,dtype=float)
     else:
         f = zeros(shape,dtype=float)
     meter = ProgressMeter(total=N)
     for l in range(N):
         meter.update(1)
         f[l] = self.snapshot(*self.getDate(h0)) 
         h0 += 6
     return f
Example #8
	def index(self, db):
		""" Index the given database. 
		Index steps consist of:
		1, seperate the content into individual words.
		2, record each word.
		3, calculate the term frequency in current page. 
		Note: index process is time-wasting. """
		conn = sqlite3.connect(db)
		cur  = conn.cursor()
		conn.text_factory = str
		dbname = db[:-3]
		sql  = "select url from %s" % dbname
		urls = [ url[0] for url in cur.execute(sql).fetchall()]
		progress = ProgressMeter(total=len(urls))
		# traverse all webpages
		for (cnt, url) in enumerate(urls):
			urlid = self.getid('urllist','url',url)
			sql = "select content from %s where url='%s'" % (dbname, url)
			html = cur.execute(sql).fetchone()[0]
			items = self.getitems(html)
			title = replace_quote(items['title'])
			sql = "insert into urltitle values(%d,'%s')" % (urlid, title)
			self.cur.execute(sql)
			content = items['content']
			words = self.analyzer.run(content)
			tfdir = {}
			# traverse all words in current webpage
			for (i, word) in enumerate(words):
				tfdir[word] = tfdir.get(word, 0) + 1
				wordid = self.getid('wordlist','word',word)
				sql = "insert into wordlocation values(%d,%d,%d)" % (urlid, wordid, i)
				self.cur.execute(sql)
			for (word, tf) in tfdir.items():
				wordid = self.getid('wordlist','word',word)
				sql = "insert into wordinfo values(%d,%d,%f)" % \
					  (urlid, wordid, float(tf)/len(words))
				self.cur.execute(sql)
			# update the progress
			if (cnt % REFRESH_CNT) == 0 or cnt == progress.total-1:
				progress.update(cnt+1)
		del progress
		cur.close()	
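Note that this example (like Example #2) interpolates url and title directly into SQL strings; that breaks on embedded quotes, which is why replace_quote is needed at all. sqlite3 supports ? placeholders for values (table names cannot be parameterized and still have to be interpolated or validated separately). A small self-contained sketch:

import sqlite3

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute("create table urltitle (urlid integer, title text)")
# '?' placeholders quote values correctly, embedded quotes and all
cur.execute("insert into urltitle values (?, ?)", (1, "it's a title"))
conn.commit()
print(cur.execute("select title from urltitle").fetchone()[0])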
Example #9
 def getTimeSlice(self, DateStart=(1958,1,1,0), DateEnd=(1958,12,31,18)):
     print ' -- Getting timeslice %s to %s' % (DateStart,DateEnd)
     h0 = self.getHours(*DateStart)
     h1 = self.getHours(*DateEnd)        
     N = int((h1-h0)/6+1)
     f = self.snapshot(*self.getDate(h0))
     shape = (N,) + f.shape
     if hasattr(f,'mask'):
         f = ma.zeros(shape,dtype=float)
     else:
         f = zeros(shape,dtype=float)
     meter = ProgressMeter(total=N)
     for l in range(N):
         meter.update(1)
         f[l] = self.snapshot(*self.getDate(h0)) 
         h0 += 6
     return f
Example #10
    def __call__(self, field, lon, lat):
        """
        input field on regular lat-lon grid
        output field on regular projection grid
        """
        if len(field.shape) == 2:
            field, lon = basemap.addcyclic(field, lon)
            field, lon = basemap.shiftgrid(180, field, lon, start=False)
            self.field = self.m.transform_scalar(field, lon, lat, self.nx,
                                                 self.ny)

        elif len(field.shape) == 3:
            n = field.shape[0]
            self.field = np.zeros((n, self.ny, self.nx), dtype='f')
            for l in range(n):
                field1, lon1 = basemap.addcyclic(field[l], lon)
                field1, lon1 = basemap.shiftgrid(180,
                                                 field1,
                                                 lon1,
                                                 start=False)
                self.field[l] = self.m.transform_scalar(
                    field1, lon1, lat, self.nx, self.ny)

        elif len(field.shape) == 4:
            n0 = field.shape[0]
            n1 = field.shape[1]
            if hasattr(field, 'mask'):
                self.field = np.ma.zeros((n0, n1, self.ny, self.nx),
                                         dtype=float)
            else:
                self.field = np.zeros((n0, n1, self.ny, self.nx), dtype=float)
            print 'LambertProjector: Projecting 4D field'
            m = ProgressMeter(total=n0 * n1)
            for l0 in range(n0):
                for l1 in range(n1):
                    field1, lon1 = basemap.addcyclic(field[l0, l1], lon)
                    field1, lon1 = basemap.shiftgrid(180,
                                                     field1,
                                                     lon1,
                                                     start=False)
                    self.field[l0, l1] = self.m.transform_scalar(
                        field1, lon1, lat, self.nx, self.ny)
                    m.update(1)

        return self.field
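Assuming the projector was constructed with a Basemap instance (self.m) and a target grid (self.nx, self.ny), calling it might look like the following; the constructor and the data shapes here are hypothetical:

import numpy as np

lat = np.arange(-90, 91, 2.0)                       # 2-degree global grid
lon = np.arange(0, 360, 2.0)
field = np.random.rand(12, lat.size, lon.size).astype('f')
proj = LambertProjector()                           # hypothetical constructor
projected = proj(field, lon, lat)                   # shape (12, proj.ny, proj.nx)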
Example #11
def get_category_recursive(cat_name, count=None):
    ret = set()
    seen_cats = set()

    if count is None:
        count = ALL
        print 'Recursively getting all members of', cat_name
    else:
        print 'Recursively getting', count, 'members of', cat_name

    jobs = []
    api_pool = Pool(CAT_CONC)
    jobs.append(api_pool.spawn(get_category, cat_name, count))
    dpm = ProgressMeter(total=count, unit="categories", ticks=30)
    while len(ret) < count and jobs:
        cur_count = count - len(ret)
        api_pool.join(timeout=0.3, raise_error=True)
        for j in list(jobs):  # iterate over a copy; jobs is mutated below
            if not j.ready():
                continue
            jobs.remove(j)
            if not j.successful():
                print 'failed a cat fetch'
                continue
            cur_mems = j.value
            for m in cur_mems:
                if m.ns == 14:
                    if m.title not in seen_cats:
                        jobs.append(api_pool.spawn(get_category, m.title, cur_count))
                        seen_cats.add(m.title)
                else:
                    ret.add(m)
                    dpm.update(1)

    dpm.update(count - len(ret))
    
    ret = list(ret)[:count]
    print 'Done, returning', len(ret), 'category members.'
    return ret
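The while loop above is effectively a breadth-first traversal of the category graph: greenlets fetch category members concurrently, and seen_cats guards against revisiting subcategories. The same traversal sketched sequentially, with a hypothetical fetch(cat) that returns (subcategories, members):

from collections import deque

def get_members_recursive(fetch, root, limit):
    """Sequential sketch of the traversal above."""
    ret, seen, todo = set(), set([root]), deque([root])
    while todo and len(ret) < limit:
        subcats, members = fetch(todo.popleft())
        ret.update(members)
        for c in subcats:
            if c not in seen:        # enqueue each subcategory once
                seen.add(c)
                todo.append(c)
    return list(ret)[:limit]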
Example #12
def Seasonal(Field='U', Season='DJF', Source='ERA40', \
             YearStart=None, YearStop=None):
    # instantiate data server
    data = DataServer(Field=Field,Source=Source)
    if YearStart is None: YearStart = data.FirstYear
    if YearStop is None: YearStop = data.LastYear
    assert YearStart >= data.FirstYear,\
                       '\nFirst year in dataset is %s' % data.FirstYear
    assert YearStop <= data.LastYear,\
                       '\nLast year in dataset is %s' % data.LastYear
    # create output file
    FileName = '%s.%s.%s.%s-%s.nc' % (Field,Season,Source,YearStart,YearStop)
    File = CreateOutputFile(FileName,data)
    print 'Creating %s' % FileName
    TimeIndex = 0
    meter = ProgressMeter(total=YearStop-YearStart+1)
    for Year in range(YearStart,YearStop+1):
        meter.update(1)
        # get 1 season of data
        SeasonData = data.getSeason(Year,Season)
        File.variables['time'][TimeIndex]  = float(Year)
        File.variables[Field][TimeIndex] = SeasonData.astype('f')
        TimeIndex += 1
    File.close()
Example #13
def save_a_bunch(count=DEFAULT_LIMIT, category=DEFAULT_CAT, concurrency=DEFAULT_CONC, 
                 per_call=DEFAULT_PER_CALL, db_name=DEFAULT_DB):
    import time

    page_ids = get_dab_page_ids(category, count)

    dabblets = []
    dpm = ProgressMeter(total=len(page_ids), unit="articles", ticks=30)
    for pages in chunked_pimap(get_articles, page_ids,
                               concurrency=concurrency,
                               chunk_size=per_call):
        for p in pages:
            dpm.update(1)
            cur_dabs = get_dabblets(p)
            dabblets.extend(cur_dabs)
    
    print
    print 'Saving', len(dabblets), 'dabblets.'
    dspm = ProgressMeter(total=len(dabblets), unit="dabblets", ticks=30)
    dsave_start = time.time()
    for d in dabblets:
        d.save()
        for img in d.source_imgs:
            dab_img = DabImage(dabblet=d, src=img)
            dab_img.save()
        dspm.update(1)
    print
    print 'Done saving', len(dabblets), 'Dabblets. (', time.time()-dsave_start,'seconds)'

    print 'Processing choices for', len(dabblets), 'Dabblets.'
    cpm = ProgressMeter(total=len(dabblets), unit="Dabblets", ticks=30)
    all_choices = []
    for choices in chunked_pimap(get_dab_choices, dabblets,
                                 concurrency=concurrency,
                                 chunk_size=per_call):
        cpm.update(per_call)
        all_choices.extend(choices)
    
    print
    print 'Saving', len(all_choices), 'DabChoices.'
    cspm = ProgressMeter(total=len(all_choices), unit="DabChoices", ticks=30)
    csave_start = time.time()
    for c in all_choices:
        c.save()
        cspm.update(1)
    print 'Done saving', len(all_choices), 'DabChoices. (', time.time()-csave_start, 'seconds)'

    drank_start = time.time()
    print 'Ranking', len(dabblets), 'Dabblets.'
    for d in dabblets:
        d.priority = d.get_priority()
        d.save()
    print 'Done ranking', len(dabblets), 'Dabblets. (', time.time()-drank_start, 'seconds)'

    print len(set([d.title for d in dabblets])), 'unique titles'
    print len(set([d.source_title for d in dabblets])), 'unique source pages'
    print len(all_choices), 'dabblet choices fetched and saved.'

    print Dabblet.select().count(), 'total records in database'
    print len(set([d.title for d in Dabblet.select()])), 'unique titles in database'

    print 'Committing...'
    return dabblets
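Examples #1 and #13 both rely on a chunked_pimap helper that is not shown here: it chunks the input list, maps a function over the chunks with a concurrent pool, and yields each chunk's result as it completes. A plausible thread-based sketch (the originals appear to use gevent, so this only approximates their semantics):

from multiprocessing.dummy import Pool  # thread-backed Pool, no pickling needed

def chunked_pimap(func, items, concurrency=4, chunk_size=10, **kwargs):
    """Yield func(chunk, **kwargs) for each chunk, in completion order."""
    chunks = [items[i:i + chunk_size]
              for i in range(0, len(items), chunk_size)]
    pool = Pool(concurrency)
    try:
        for result in pool.imap_unordered(lambda c: func(c, **kwargs), chunks):
            yield result
    finally:
        pool.close()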