import bisect
import json
import time
from datetime import datetime

import common


def run(input_file, output_file, raw, long, lat, time_interval, period, area_limit):
    """Mark recurrent sunspots: for each large spot last seen near the +90 degree
    limb, look for a record roughly one rotation (`period` days) later that falls
    inside an elliptical longitude/latitude window, and link the two records."""
    PERIOD_IN_SECONDS = period * 24 * 60 * 60
    TIME_INTERVAL = 60 * 60 * time_interval
    lines = input_file.readlines()
    records = []

    def get_longtitude_after_period(long):
        # Expected longitude after `period` days, assuming a drift of
        # 180/14 degrees per day and wrapping by a full turn.
        return long + 180 * period / 14 - 360

    def timestamp_from_date(date_s):
        return time.mktime(
            datetime.strptime(date_s, '%Y-%m-%d %H:%M:%S').timetuple())

    for i in range(len(lines)):
        record = common.Record()
        record.__dict__.update(json.loads(lines[i]))
        try:
            record.time = timestamp_from_date(record.time)
        except (TypeError, ValueError):
            continue
        records.append(record)

    records.sort(key=lambda record: record.time)
    times = [x.time for x in records]
    sunspots = common.getSunspotsFromRecords(records)

    def is_same(first_record, second_record):
        # The candidate must lie inside an ellipse with semi-axes `long`
        # and `lat` centred on the predicted position.
        long_diff = abs(
            get_longtitude_after_period(first_record.longtitude)
            - second_record.longtitude)
        lat_diff = abs(first_record.latitude - second_record.latitude)
        return long_diff < long and lat_diff < lat and \
            (long_diff / long) ** 2 + (lat_diff / lat) ** 2 <= 1

    # Only large spots last observed within one day's drift of the +90 degree
    # limb are candidates for recurrence.
    spots_to_check = [
        sunspot for sunspot in sunspots
        if sunspot.records[-1].longtitude >= (90 - 180 / 14)
        and sunspot.area >= area_limit
    ]

    for spot in spots_to_check:
        record = spot.records[-1]
        # Binary-search the time-sorted records for the window
        # [period - time_interval, period + time_interval] after the last sighting.
        left = bisect.bisect_left(
            times, record.time + PERIOD_IN_SECONDS - TIME_INTERVAL)
        right = bisect.bisect_left(
            times, record.time + PERIOD_IN_SECONDS + TIME_INTERVAL)
        for record2 in records[left:right]:
            if not record2.old and is_same(record, record2):
                if raw:
                    output_file.write(
                        str(record.group_id) + ' ' + str(record2.group_id) + '\n')
                record2.old = True
                record2.previous_id = record.group_id
                break

    if not raw:
        for record in records:
            output_file.write(json.dumps(record.__dict__) + "\n")
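# Illustrative invocation of the marking step. The file names follow the
# data/rgofull.* pattern used elsewhere in this repository, but the tolerance
# values, period, and area limit below are placeholder assumptions, not the
# parameters used in the original pipeline.
if __name__ == '__main__':
    with open("data/rgofull.hsc.json") as inp, \
            open("data/rgofull.marked.json", "w") as out:
        run(inp, out, raw=False, long=10, lat=3,
            time_interval=12, period=27, area_limit=100)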
def get_hash(self, kw, random_=0, limit_=None, sample_=None):
    """Yield table rows as Record objects, filtered by the column=value
    pairs in `kw`, optionally sampled, shuffled, and limited."""
    cur = self.con.cursor()
    # Build a parameterized SELECT from the keyword filters.
    cmd = "select * from " + self.tname + " where id>=0"
    values = []
    for k, v in kw.items():
        cmd += " and %s=?" % k
        conv = self.converters.get(k, None)
        if conv is not None:
            v = conv.pickle(v)
        values += [v]
    if sample_:
        # Keep roughly one row in `sample_` using the row id.
        cmd += " and id%%%d=abs(random())%%%d" % (sample_, sample_)
    if random_:
        cmd += " order by random()"
    if limit_ is not None:
        cmd += " limit %d" % limit_
    if self.verbose:
        print "#", cmd, values
    if debug:
        print cmd
    for row in cur.execute(cmd, values):
        # Unpickle converted columns and expose each row as attributes.
        result = common.Record()
        for k in row.keys():
            conv = self.converters.get(k, None)
            v = row[k]
            if conv is not None:
                v = conv.unpickle(v)
            setattr(result, k, v)
        yield result
    cur.close()
    del cur
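# Minimal sketch showing how get_hash can be driven. The wrapper class, table
# name, and column names here are assumptions for illustration only; the real
# code presumably supplies `con`, `tname`, `converters`, and `verbose` from its
# own table class.
import sqlite3

import common


class _DemoTable(object):
    def __init__(self, con, tname):
        self.con = con
        self.tname = tname
        self.converters = {}   # no pickling converters in this sketch
        self.verbose = 0
    get_hash = get_hash        # reuse the generator defined above


debug = 0  # module-level flag consulted by get_hash

con = sqlite3.connect(":memory:")
con.row_factory = sqlite3.Row  # get_hash relies on row.keys()
con.execute("create table chars (id integer primary key, cls text)")
con.executemany("insert into chars (id, cls) values (?, ?)",
                [(0, "A"), (1, "B"), (2, "A")])

for rec in _DemoTable(con, "chars").get_hash({"cls": "A"}, limit_=2):
    print rec.id, rec.cls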
import json
import random

import common


def run(input_file, output_file, raw, angle_step, area_step_low, area_limit_low,
        area_step_high, area_limit_high):
    """Balance the two longitude halves (first-observation longitude <= 0
    vs. > 0): in every (angle, area) bin, filter out surplus spots from the
    non-positive half so it contains no more spots than the positive half."""
    AREA_STEP = area_step_high
    AREA_LIMIT = area_limit_high

    def getFrame(arr, angle_limit, area_limit):
        # Spots that are still unfiltered, not marked as recurrent, and whose
        # first observation falls into the given longitude and area bins.
        return [
            x for x in arr
            if not x.filtered and not x.old
            and (abs(x.records[0].longtitude) >= angle_limit - angle_step)
            and (abs(x.records[0].longtitude) < angle_limit)
            and (x.area >= area_limit - AREA_STEP)
            and (x.area < area_limit)
        ]

    lines = input_file.readlines()
    records = []
    for i in range(len(lines)):
        record = common.Record()
        record.__dict__.update(json.loads(lines[i]))
        records.append(record)

    sunspots = common.getSunspotsFromRecords(records)
    random.shuffle(sunspots)
    left = [x for x in sunspots if x.records[0].longtitude <= 0]
    right = [x for x in sunspots if x.records[0].longtitude > 0]

    def apply_filter():
        for j in range(1, AREA_LIMIT // AREA_STEP + 1):
            area_limit = AREA_STEP * j
            for i in range(1, 90 // angle_step + 1):
                angle_limit = i * angle_step
                count = max(
                    0,
                    len(getFrame(left, angle_limit, area_limit))
                    - len(getFrame(right, angle_limit, area_limit)))
                for x in getFrame(left, angle_limit, area_limit)[:count]:
                    x.filtered = True

    # First pass with coarse area bins, second pass with fine ones.
    apply_filter()
    AREA_STEP = area_step_low
    AREA_LIMIT = area_limit_low
    apply_filter()

    for x in left:
        if not x.filtered:
            if raw:
                output_file.write(str(x.id) + "\n")
            else:
                for y in x.records:
                    output_file.write(json.dumps(y.__dict__) + "\n")
    for x in right:
        if raw:
            output_file.write(str(x.id) + "\n")
        else:
            for y in x.records:
                output_file.write(json.dumps(y.__dict__) + "\n")
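# Illustrative invocation of the two-pass balancing filter. The bin sizes,
# limits, and output file name below are placeholder assumptions rather than
# values from the original pipeline.
if __name__ == '__main__':
    with open("data/rgofull.marked.json") as inp, \
            open("data/rgofull.balanced.json", "w") as out:
        run(inp, out, raw=False, angle_step=10,
            area_step_low=10, area_limit_low=100,
            area_step_high=100, area_limit_high=1000)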
import json

import common


def run(input_file, output_file):
    # Parse fixed-width catalogue lines into JSON records, one per output line.
    for line in input_file.readlines():
        if len(line) < 74:
            continue
        record = common.Record()
        record.group_id = line[12:20]
        try:
            record.group_id = int(record.group_id)
        except ValueError:
            continue
        record.latitude = float(line[63:68])
        record.longtitude = float(line[57:62])
        # formatDate is a date-formatting helper provided elsewhere in this project.
        record.time = formatDate(line[:4], line[4:6], line[6:8], line[9:12])
        record.area = int(line[40:44])
        output_file.write(json.dumps(record.__dict__) + '\n')
import json

from astropy import units as u
from astropy.coordinates import SkyCoord
from sunpy.coordinates import frames

import common


def run(input_file, output_file):
    lines = input_file.readlines()
    progress = 0
    for i in range(len(lines)):
        record = common.Record()
        record.__dict__.update(json.loads(lines[i]))
        # Convert the record's Carrington longitude to Stonyhurst
        # (Earth-facing) longitude at the observation time.
        coord = SkyCoord(
            lat=record.latitude * u.deg,
            lon=record.longtitude * u.deg,
            obstime=record.time,
            frame=frames.HeliographicCarrington)
        record.longtitude = coord.transform_to(
            frames.HeliographicStonyhurst).lon.deg
        # Report progress in tenths of a percent.
        percents_completed = i * 1000 // len(lines)
        if percents_completed != progress:
            progress = percents_completed
            print("Processing: " + str(progress / 10) + "%")
        output_file.write(json.dumps(record.__dict__) + "\n")
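# Illustrative invocation. The input/output names follow the data/rgofull.*
# pattern used elsewhere in this repository, but the exact wiring is assumed.
if __name__ == '__main__':
    with open("data/rgofull.json") as inp, \
            open("data/rgofull.hsc.json", "w") as out:
        run(inp, out)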
import json
from datetime import datetime

import matplotlib.pyplot as plt

import common

input_file2 = open("data/data.csv", "r")
numberByDate = dict()
for line in input_file2.readlines():
    (year, month, day, skip, number) = line.split(";")[:5]
    if int(year) > 2013 or int(year) < 1900:
        continue
    numberByDate[year + "-" + month + "-" + day] = (int(number), 0, 0)

# to_average is a helper defined elsewhere in this script.
plt.plot(to_average([i[0] for i in numberByDate.values()]),
         label='Daily total sunspot number')

input_file = open("data/rgofull.marked.json", "r")
records = []
for line in input_file.readlines():
    records.append(common.Record())
    records[-1].__dict__.update(json.loads(line))

sunspots = common.getSunspotsFromRecords(records)
for sunspot in sunspots:
    # Only spots marked as recurrent contribute to the hemispheric area sums.
    if not sunspot.old:
        continue
    for record in sunspot.records:
        date = datetime.utcfromtimestamp(record.time).strftime('%Y-%m-%d')
        if record.latitude >= 0:
            numberByDate[date] = (numberByDate[date][0],
                                  numberByDate[date][1] + record.area,
                                  numberByDate[date][2])
        else:
            numberByDate[date] = (numberByDate[date][0],
                                  numberByDate[date][1],
                                  numberByDate[date][2] + record.area)
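# Hedged continuation sketch: the loop above accumulates, per day, a tuple of
# (daily total sunspot number, recurrent-spot area in the north, recurrent-spot
# area in the south). Plotting the two area series is one plausible way to
# finish the figure; the labels below are assumptions, not taken from the
# original script.
vals = list(numberByDate.values())
plt.plot(to_average([v[1] for v in vals]), label='Recurrent spot area (north)')
plt.plot(to_average([v[2] for v in vals]), label='Recurrent spot area (south)')
plt.legend()
plt.show()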
def recognizeLineSeg(self, image):
    """Recognize a line.

    lattice: result of recognition
    rseg: intarray where the raw segmentation will be put
    image: line image to be recognized"""

    # first check whether the input dimensions are reasonable
    if image.shape[0] < 10:
        raise RecognitionError(
            "line image not high enough (maybe rescale?)", image=image)
    if image.shape[0] > 200:
        raise RecognitionError(
            "line image too high (maybe rescale?)", image=image)
    if image.shape[1] < 10:
        raise RecognitionError(
            "line image not wide enough (segmentation error?)", image=image)
    if image.shape[1] > 10000:
        raise RecognitionError("line image too wide???", image=image)

    # FIXME for some reason, something down below
    # depends on this being a bytearray image, so
    # we're normalizing it here to that type
    image = array(image * 255.0 / amax(image), 'B')

    # compute the raw segmentation
    rseg = self.segmenter.charseg(image)
    if self.debug:
        show_segmentation(rseg)  # FIXME
    rseg = renumber_labels(rseg, 1)  # FIXME
    if amax(rseg) < self.minsegs:
        raise RecognitionError(
            "not enough segments in raw segmentation", rseg=rseg)
    # self.grouper = grouper.Grouper()
    self.grouper.setSegmentation(rseg)

    # compute the geometry (might have to use
    # CCS segmenter if this doesn't work well)
    geo = docproc.seg_geometry(rseg)

    # compute the median segment height
    heights = []
    for i in range(self.grouper.length()):
        (y0, x0, y1, x1) = self.grouper.boundingBox(i)
        heights.append(y1 - y0)
    mheight = median(array(heights))
    if mheight < 8:
        raise RecognitionError(
            "median line height too small (maybe rescale prior to recognition)",
            mheight=mheight)
    if mheight > 100:
        raise RecognitionError(
            "median line height too large (maybe rescale prior to recognition)",
            mheight=mheight)
    self.mheight = mheight

    # invert the input image (make a copy first)
    old = image
    image = amax(image) - image

    # initialize the whitespace estimator
    self.whitespace.setLine(image, rseg)

    # this holds the list of recognized characters if keep!=0
    self.chars = []

    # now iterate through the characters
    for i in range(self.grouper.length()):
        # get the bounding box for the character (used later)
        (y0, x0, y1, x1) = self.grouper.boundingBox(i)

        # compute relative geometry
        aspect = (y1 - y0) * 1.0 / (x1 - x0)
        try:
            rel = docproc.rel_char_geom((y0, y1, x0, x1), geo)
        except:
            traceback.print_exc()
            raise RecognitionError("bad line geometry", geo=geo)
        ry, rw, rh = rel
        assert rw > 0 and rh > 0, "error: rw=%g rh=%g" % (rw, rh)
        rel = docproc.rel_geo_normalize(rel)

        # extract the character image (and optionally display it)
        (raw, mask) = self.grouper.extractWithMask(image, i, 1)
        char = raw / 255.0
        if self.debug:
            imshow(char)
            raw_input()

        # Add a skip transition with the pixel width as cost.
        # This ensures that the lattice is at least connected.
        # Note that for typical character widths, this is going
        # to be much larger than any per-character cost.
        if self.add_rho:
            self.grouper.setClass(i, ocrofst.L_RHO,
                                  self.rho_scale * raw.shape[1])

        # compute the classifier output for this character
        # FIXME parallelize this
        outputs = self.cmodel.coutputs(char, geometry=rel)
        outputs = [(x[0], -log(x[1])) for x in outputs]
        self.chars.append(
            common.Record(index=i, image=char, outputs=outputs))

        # estimate the space cost
        sc = self.whitespace.classifySpace(x1)
        yes_space = min(self.maxspacecost, -log(sc[1]))
        no_space = min(self.maxspacecost, -log(sc[0]))

        # maybe add a transition on "_" that we can use to skip
        # this character if the transcription contains a "~"
        self.grouper.setClass(i, "~", self.reject_cost)

        # add the top classes to the lattice
        outputs.sort(key=lambda x: x[1])
        for cls, cost in outputs[:self.nbest]:
            # don't add anything with a cost above maxcost
            # if cost>self.maxcost and cls!="~": continue
            if cls == "~":
                continue
            if cls in self.debug_cls:
                print "debug", self.grouper.start(i), self.grouper.end(i), \
                    "cls", cls, "cost", cost, \
                    "y %.2f w %.2f h %.2f" % (rel[0], rel[1], rel[2])

            # letters are never small, so we skip small bounding boxes that
            # are categorized as letters; this is an ugly special case, but
            # it is quite common
            category = unicodedata.category(unicode(cls[0]))
            if (y1 - y0) < self.min_height * mheight and category[0] == "L":
                # add an empty transition to allow skipping junk
                # (commented out right now because I'm not sure whether
                # the grouper can handle it; FIXME)
                # self.grouper.setClass(i,"",1.0)
                continue

            if type(cls) == int:
                assert self.allow_any or (cls >= 0 and cls < 0x110000), \
                    "classifier returned non-unicode class: %s" % (hex(cls),)
            elif type(cls) == str:
                assert len(cls) < 4, \
                    ("classifier returned too many chars: %s", cls)

            # for anything else, just add the classified character to the grouper
            if type(cls) == str or type(cls) == unicode:
                self.grouper.setClass(i, cls, cost)
            elif type(cls) == int:
                assert cls >= 0 and cls < 0x110000, \
                    "bad class: %s" % (hex(cls),)
                self.grouper.setClass(i, cls, cost)
            else:
                raise Exception("bad class type: %s" % type(cls))

        self.grouper.setSpaceCost(i, float(yes_space), float(no_space))

    # extract the recognition lattice from the grouper
    lattice = self.grouper.getLattice()
    # return the raw segmentation as a result
    return lattice, rseg
import bisect
import json
import math
import time
from datetime import datetime

import matplotlib.pyplot as plt

import common


def run(input_file, long, lat, time_interval):
    """Sweep the recurrence-matching tolerance and plot an accuracy score
    for assumed rotation periods of 7 to 13 days."""
    TIME_INTERVAL = 60 * 60 * time_interval

    def get_longtitude_after_period(long):
        # Expected longitude after PERIOD_IN_DAYS, assuming a drift of
        # 180/14 degrees per day.
        return long + 180 * PERIOD_IN_DAYS / 14

    def timestamp_from_date(date_s):
        return time.mktime(
            datetime.strptime(date_s, '%Y-%m-%d %H:%M:%S').timetuple())

    lines = input_file.readlines()
    records = []
    for i in range(len(lines)):
        record = common.Record()
        record.__dict__.update(json.loads(lines[i]))
        try:
            record.time = timestamp_from_date(record.time)
        except (TypeError, ValueError):
            continue
        records.append(record)

    records.sort(key=lambda record: record.time)
    times = [x.time for x in records]
    sunspots = common.getSunspotsFromRecords(records)

    def mark_and_get_accuracy(long_interval, lat_interval):
        def is_same(first_record, second_record):
            # Candidate must fall inside the search ellipse around the
            # predicted position.
            long_diff = abs(
                get_longtitude_after_period(first_record.longtitude)
                - second_record.longtitude)
            lat_diff = abs(first_record.latitude - second_record.latitude)
            return long_diff < long_interval and lat_diff < lat_interval and \
                (long_diff / long_interval) ** 2 + (lat_diff / lat_interval) ** 2 <= 1

        def get_score(first_record, second_record):
            # Overlap (in degrees squared) between the matched spot's own box
            # and the search window around the predicted position.
            r = math.sqrt(second_record.area) / 2
            lat_l = max(second_record.latitude - r,
                        first_record.latitude - lat_interval)
            lat_r = min(second_record.latitude + r,
                        first_record.latitude + lat_interval)
            long_l = max(second_record.longtitude - r,
                         get_longtitude_after_period(first_record.longtitude)
                         - long_interval)
            long_r = min(second_record.longtitude + r,
                         get_longtitude_after_period(first_record.longtitude)
                         + long_interval)
            return max(0, (lat_r - lat_l)) * max(0, (long_r - long_l))

        for record in records:
            record.marked = False

        # Spots observed for at least one full period serve as ground truth.
        long_live_sunspots = [
            sunspot for sunspot in sunspots
            if sunspot.records[-1].time - sunspot.records[0].time >= PERIOD_IN_SECONDS
        ]
        score = 0
        count = 0
        for spot in long_live_sunspots:
            record = spot.records[0]
            left = bisect.bisect_left(
                times, record.time + PERIOD_IN_SECONDS - TIME_INTERVAL)
            right = bisect.bisect_left(
                times, record.time + PERIOD_IN_SECONDS + TIME_INTERVAL)
            for record2 in records[left:right]:
                if not record2.marked and is_same(record, record2):
                    record2.marked = True
                    score += get_score(record, record2)
                    count += 1
                    break
        return score / (lat_interval * long_interval)

    for j in range(7, 14):
        PERIOD_IN_DAYS = j
        PERIOD_IN_SECONDS = PERIOD_IN_DAYS * 24 * 60 * 60
        temp = []
        for i in range(1, 21):
            t = 1 + (i - 10) / 10
            long_interval = t * long
            lat_interval = t * lat
            temp.append(mark_and_get_accuracy(long_interval, lat_interval))
        plt.plot(temp, label=str(j) + ' days')

    plt.legend()
    plt.xlabel('ellipse linear size coefficient')
    plt.ylabel('accuracy score')
    plt.xticks([i for i in range(20)],
               [str(1.15 + (i - 10) / 10)[:3] for i in range(20)],
               rotation=-45)
    plt.grid()
    plt.show()
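# Illustrative invocation of the calibration sweep. The input file follows the
# data/rgofull.* pattern used elsewhere in this repository; the tolerance
# values are placeholder assumptions, not the ones used in the original study.
if __name__ == '__main__':
    with open("data/rgofull.hsc.json") as inp:
        run(inp, long=10, lat=3, time_interval=12)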
#!/usr/bin/python3
import json

import common

main_file = open("data/rgofull.json", "r")
merging_file = open("data/rgofull.hsc.json", "r")

lines = main_file.readlines()
records = []
for i in range(len(lines)):
    record = common.Record()
    record.__dict__.update(json.loads(lines[i]))
    records.append(record)

lines = merging_file.readlines()
for i in range(len(lines)):
    record = common.Record()
    record.__dict__.update(json.loads(lines[i]))
    records[i].longtitude = record.longtitude

main_file.close()

output_file = open("data/rgofull.hsc.json", "w")
for record in records:
    output_file.write(json.dumps(record.__dict__) + '\n')