def crear_trabajo(cron, date, hour, gramos):
    # "MM/DD" -> [MM, DD]; "HH:MM" -> [HH, MM]
    date = date.strip().split("/")
    hour = hour.strip().split(":")
    job = cron.new(command='python archivos.py ' + gramos)
    # Schedule the job for the current year at the given month/day and time.
    job.setall(datetime(datetime.today().year,
                        int(date[0]), int(date[1]),
                        int(hour[0]), int(hour[1])))
    job.enable()
    cron.write_to_user(user="******")
def _get_case_dates(self):
    path = "{base}/td[1]/text()".format(base=self.base)
    return [
        convert_date_string(date.strip())
        for date in self.html.xpath(path)
        if date.strip()
    ]
def scrapeBaramDom():
    # UTF-8 support (Python 2)
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader('http://www.baramdom.com/')
    content = down.get_content()
    html = unicode(content)
    p = xpath.get(html, '//div[@class="box post"]')
    linkovi = xpath.search(p, '//div[@class="content"]')
    ads = []
    for l in linkovi:
        link = "http://www.baramdom.com" + xpath.get(l, '//div[@class="post-title"]/h2/a/@href')
        title = xpath.get(l, '//div[@class="post-title"]/h2/a')
        imageUrl = xpath.get(l, '//a[@class="grouped"]/img/@src')
        if imageUrl == "":
            imageUrl = "http://www.baramdom.com/img/apartment_noimage.png"
        else:
            imageUrl = "http://www.baramdom.com" + imageUrl
        download = Downloader(link)
        cont = unicode(download.get_content())
        description = xpath.get(cont, '//p[@class="post_add_desc"]').strip()
        category = u"Недвижнини"
        # Title looks like "<verb>ам <subcategory> во <region>".
        ost = xpath.get(l, '//p[@class="add-title"]').strip().split(" во ")
        region = ost[1]
        country = u"Македонија"
        subcategory = ost[0].split("ам ")[1]
        price = xpath.get(cont, '//div[@class="post-add"]/p[@class="last"]').strip().split(" ")
        if len(price) == 3:
            value = "/"
            currency = "/"
        else:
            value = price[0]
            currency = price[1]
        if currency == "Euro.":
            currency = "EUR"
        elif currency == u"Ден.":
            currency = "MKD"
        # "dd-mm-yyyy ..." -> "yyyy-mm-dd"
        date = xpath.get(l, '//div[@class="fl"]').strip().split(">")[1].strip().split(" ")[0]
        d = date.split("-")
        date = d[2] + "-" + d[1] + "-" + d[0]
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

#print scrapeBaramDom()
def date_fetcher(self):
    self.dates = []
    self.date_index = []
    for line in self.raw_url:
        # Looks for the first sign of a date and removes empty spaces on either side.
        if """<td class="col1">""" in line:
            date = line.strip(" ")
            # Ensures it is a date with a simple digit check at a fixed index;
            # appends the final dates to a list and also records the index of
            # each date for finding the related metric.
            if line[-6].isdigit():
                date = date[17:]
                date = date.strip("</td>")
                self.dates.append(date)
                self.date_index.append(self.raw_url.index(line))
    self.dates = self.dates[0:35]
    return self.dates
def __init__(self, well_id, name, date, gor, wc, oil, gas, water, injection,
             oil_cum, gas_cum, water_cum, injection_cum, liquid_form,
             liquid_form_cum, gas_form, gas_form_cum, injection_form,
             injection_form_cum, work_time, work_time_cum):
    self.well_id = well_id.strip()
    self.name = name.strip()
    try:
        self.date = datetime.strptime(date.strip(), '%Y%m%d')
    except (AttributeError, TypeError, ValueError):
        # Keep non-string / unparseable dates as-is.
        self.date = date
    self.gor = float(gor)
    self.wc = float(wc)
    self.oil = float(oil)
    self.gas = float(gas)
    self.water = float(water)
    self.injection = float(injection)
    self.oil_cum = float(oil_cum)
    self.gas_cum = float(gas_cum)
    self.water_cum = float(water_cum)
    self.injection_cum = float(injection_cum)
    self.liquid_form = float(liquid_form)
    self.liquid_form_cum = float(liquid_form_cum)
    self.gas_form = float(gas_form)
    self.gas_form_cum = float(gas_form_cum)
    self.injection_form = float(injection_form)
    self.injection_form_cum = float(injection_form_cum)
    self.work_time = float(work_time)
    self.work_time_cum = float(work_time_cum)
def update_serie(sender, instance, **kwargs):
    serie_id = instance.imdb_id
    page = requests.get('http://www.imdb.com/title/' + serie_id)
    tree = html.fromstring(page.text)
    saison_list = tree.xpath('//*[@id="title-episode-widget"]/div/div[3]/a/text()')
    for saison in saison_list:
        if str(saison).isdigit():
            # .get() raises DoesNotExist instead of returning None, so use .first().
            new_saison = Saison.objects.filter(movie=instance, saison_number=saison).first()
            if new_saison is None:
                new_saison = Saison(movie=instance, saison_number=saison)
                new_saison.save()
            page = requests.get('http://www.imdb.com/title/' + serie_id + '/episodes?season=' + saison)
            tree = html.fromstring(page.text)
            episode_list_name = tree.xpath('//*[@itemprop="episodes"]/strong[1]/a/text()')
            episode_list_date = tree.xpath('//*[@itemprop="episodes"]/div[1]/text()')
            for name, date in zip(episode_list_name, episode_list_date):
                release_date = None
                # todo make a lot better
                try:
                    release_date = datetime.strptime(date.strip(), '%d %b. %Y')
                except ValueError:
                    pass
                # todo make a lot better (use a get()?)
                episodes = Episode.objects.filter(saison=new_saison, episode_name=name)
                if len(episodes) == 0:
                    episode = Episode(saison=new_saison, episode_name=name)
                else:
                    # todo make a lot better (just don't do this [0])
                    episode = episodes[0]
                episode.release_date = release_date
                episode.save()
def InsertFoundationExchange(stockID, ForeignInvestorBuy, ForeignInvestorSell,
                             InvestmentTrustBuy, InvestmentTrustSell,
                             DealerBuy, DealerSell, TotalVolume, Category, date):
    if (stockID.strip() == "" or
            ForeignInvestorBuy.strip() == "" or ForeignInvestorSell.strip() == "" or
            InvestmentTrustBuy.strip() == "" or InvestmentTrustSell.strip() == "" or
            DealerBuy.strip() == "" or DealerSell.strip() == "" or
            TotalVolume.strip() == "" or Category.strip() == "" or date.strip() == ""):
        dbgPrint("InsertFoundationExchange: Parameters cannot be empty")
        return -1
    try:
        valid_date(date)
        # Get CoId
        cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
        row = cursor.fetchall()
        if cursor.rowcount <= 0:
            dbgPrint("InsertFoundationExchange: Error: Cannot locate Company ID "
                     + str(stockID) + ":" + str(cursor.rowcount))
            return -1
        # Check for a duplicate, i.e. same CoId, same date and same category.
        if check_record(str(row[0][0]), date, Category, "FoundationExchange") != 0:
            dbgPrint("InsertFoundationExchange (Error): Record already exists: coID["
                     + str(row[0][0]) + "] date[" + str(date) + "] Category[" + str(Category) + "]")
            return -1
        add_fs = ("INSERT INTO FoundationExchange (CoId, ForeignInvestorBuy, ForeignInvestorSell, "
                  "InvestmentTrustBuy, InvestmentTrustSell, DealerBuy, DealerSell, TotalVolume, Category, date) "
                  "VALUES (%(_coid)s, %(_foreigninvestorbuy)s, %(_foreigninvestorsell)s, %(_investmenttrustbuy)s, %(_investmenttrustsell)s, "
                  "%(_dealerbuy)s, %(_dealersell)s, %(_totalvolume)s, %(_category)s, %(_date)s)")
        data_fs = {
            '_coid': int(row[0][0]),
            '_foreigninvestorbuy': int(ForeignInvestorBuy),
            '_foreigninvestorsell': int(ForeignInvestorSell),
            '_investmenttrustbuy': int(InvestmentTrustBuy),
            '_investmenttrustsell': int(InvestmentTrustSell),
            '_dealerbuy': int(DealerBuy),
            '_dealersell': int(DealerSell),
            '_totalvolume': int(TotalVolume),
            '_category': int(Category),
            '_date': date,
        }
        cursor.execute(add_fs, data_fs)
        db.commit()
    except mcon.Error as err:
        dbgPrint("FoundationExchange: DB Error [" + str(err) + "]")
        return -1
    except Exception as e:
        dbgPrint("FoundationExchange (ERROR): Exception")
        dbgPrint(e)
        raise
    dbgPrint("FoundationExchange: Insert Completed: " + str(data_fs))
    return 0
def get_milestone_date(milestone):
    date = ''
    ndx = milestone.find(' ') + 1
    if ndx > 0:  # find() returns -1 when no space is present
        date = milestone[ndx:ndx + 11]  # date must be in format 2020-01-01
        date = date.strip()
    dateparts = []
    if date:
        dateparts = date.split('-')
        try:
            int(dateparts[0])
            int(dateparts[1])
            int(dateparts[2])
        except Exception:
            date = ''  # invalid date for milestone
    return date.strip()
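# A doctest-style sketch of get_milestone_date() above: the date is read as
# the 11 characters after the first space and validated as three integer
# parts (the example milestone strings are hypothetical):
#
#   >>> get_milestone_date('v2.1 2020-01-01 release')
#   '2020-01-01'
#   >>> get_milestone_date('v2.1 sometime soon')
#   ''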
def isodate(date):
    """Parse a date string, trying progressively coarser ISO formats."""
    date = date.strip()
    for fmt in ('%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M:%S',
                '%Y-%m-%d', '%Y-%m', '%Y'):
        try:
            return datetime.strptime(date, fmt)
        except ValueError:
            continue
    log.error('date could not be decoded: %s' % date)
    return None
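# A minimal usage sketch for isodate() above, assuming the same `datetime`
# and `log` names are in scope; because the formats are tried from finest to
# coarsest, partial dates default their missing fields to January 1 / midnight:
#
#   >>> isodate('2020-03-04T10:20:30')
#   datetime.datetime(2020, 3, 4, 10, 20, 30)
#   >>> isodate('2020-03')
#   datetime.datetime(2020, 3, 1, 0, 0)
#   >>> isodate('not a date')  # logs an error and returns None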
def parse_movies(self, div, schedule):
    items = []
    localtz = timezone('Europe/Lisbon')
    for movie in schedule.select('div[@class="%s"]' % div):
        title = movie.select('div[@class="infoTitleProg"]/text()').extract()[0]
        desc = ""
        try:
            infoBiblio = movie.select('div[@class="infoBiblio"]/text()').extract()
            # 0 - director
            director = infoBiblio[0].strip(' \t\n\r')
            # 1 - actors
            actors = infoBiblio[1].strip(' \t\n\r')
            # 2 - country, year - duration
            reg = re.compile(r'(?P<country>.*?),'
                             r' +'
                             r'(?P<year>\d{4})'
                             r' +- +'
                             r'(?P<duration>\d{,3})')
            temp = reg.match(infoBiblio[2])
            country = temp.group('country')
            year = int(temp.group('year'))
            duration = int(temp.group('duration'))
        except Exception:
            pass
        for info in movie.select('div[@class="infoText"]/p/text()').extract():
            desc += info.strip(' \t\n\r')
            desc += '\n'
        dates_locations = movie.select('div[@class="infoDate"]/text()').re(
            r'\d{,2}-\d{,2}-\d{4}, \d{,2}h\d{,2} \| .*')
        for date_location in dates_locations:
            item = ScheduleItem()
            # dates are stored in Lisbon local time
            date, location = date_location.split("|")
            date_obj = datetime.strptime(date.strip(' \t\n\r'), '%d-%m-%Y, %Hh%M')
            date_obj_aware = localtz.localize(date_obj)
            item['date'] = date_obj_aware
            item['title'] = title.strip(' \t\n\r')
            item['location'] = location.strip(' \t\n\r')
            try:
                item['director'] = director
                item['actors'] = actors
                item['country'] = country
                item['year'] = year
                item['duration'] = duration
            except NameError:
                pass
            item['desc'] = desc.strip(' \t\n\r')
            items.append(item)
    return items
def messageList(id):
    subject = subjects.eq(id)
    author, date = subject.parents("td.printhead").eq(0).next().text().split(",", 1)
    messages[id] = {
        "hash": fromQueryString(subject.attr("href"), "mopen"),
        "subject": subject.text(),
        "author": author.strip().replace("von ", ""),
        "date": date.strip(),
    }
    if not all:
        print self.asciiout.trim("[" + str(id) + "] " + messages[id]["author"] + ": " + subject.text())
def parse_pro(self, response):
    sel = Selector(response)
    codelist = sel.xpath('//meta[re:test(@name,"Keywords")]/@content').extract()
    code = str(codelist[0]).split(')')[0].split('(')[-1]
    stock_info = StockInfoItem()
    stock_info['code'] = code

    name = u"组织形式"  # "form of ownership"
    stock_list = sel.xpath(
        '//div/table/tr/td[re:test(text(),"%s")]/following-sibling::*/text()' % (name)).extract()
    ownership = ""
    for stock in stock_list:
        ownership += str(stock.strip())
    if ownership != "":
        stock_info['ownership'] = ownership

    name = u"成立日期"  # "founding date"
    stock_date = sel.xpath(
        '//div/table/tr/td[re:test(text(),"%s")]/following-sibling::*[1]//text()' % (name)).extract()
    found_date = ""
    for date in stock_date:
        found_date += date.strip()
    if found_date != "":
        stock_info['found_date'] = found_date

    name = u"上市日期"  # "listing date"
    stock_date = sel.xpath(
        '//div/table/tr/td[re:test(text(),"%s")]/following-sibling::*[1]//text()' % (name)).extract()
    market_list_date = ""
    for date in stock_date:
        market_list_date += date.strip()
    if market_list_date != "":
        stock_info['market_list_date'] = market_list_date

    yield stock_info
def InsertMonthlyRevenue(stockID, MonthlyRevenue, LastMonthlyRevenue, LastYearMonthlyRevenue,
                         MonthlyIncreaseRevenue, LastYearMonthlyIncreaseRevenue,
                         CumulativeRevenue, LastYearCumulativeRevenue, CompareCumulativeRevenue, date):
    if (stockID.strip() == "" or
            MonthlyRevenue.strip() == "" or LastMonthlyRevenue.strip() == "" or
            LastYearMonthlyRevenue.strip() == "" or MonthlyIncreaseRevenue.strip() == "" or
            LastYearMonthlyIncreaseRevenue.strip() == "" or CumulativeRevenue.strip() == "" or
            LastYearCumulativeRevenue.strip() == "" or CompareCumulativeRevenue.strip() == "" or
            date.strip() == ""):
        dbgPrint("InsertMonthlyRevenue: Parameters cannot be empty")
        return -1
    try:
        valid_date(date)
        # Get CoId
        cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
        row = cursor.fetchall()
        if cursor.rowcount <= 0:
            dbgPrint("InsertMonthlyRevenue: Error: Cannot locate Company ID "
                     + str(stockID) + ":" + str(cursor.rowcount))
            return -1
        if check_record(str(row[0][0]), date, "", "MonthlyRevenue") != 0:
            dbgPrint("InsertMonthlyRevenue: Error: Record already exists, please make sure there are no duplicates")
            return -1
        add_fs = ("INSERT INTO MonthlyRevenue (CoId, MonthlyRevenue, LastMonthlyRevenue, LastYearMonthlyRevenue, MonthlyIncreaseRevenue, "
                  "LastYearMonthlyIncreaseRevenue, CumulativeRevenue, LastYearCumulativeRevenue, CompareCumulativeRevenue, date) "
                  "VALUES (%(_coid)s, %(_monthlyrevenue)s, %(_lastmonthlyrevenue)s, %(_lastyearmonthlyrevenue)s, %(_monthlyincreaserevenue)s, "
                  "%(_lastyearmonthlyincreaserevenue)s, %(_cumulativerevenue)s, %(_lastyearcumulativerevenue)s, %(_comparecumulativerevenue)s, %(_date)s)")
        data_fs = {
            '_coid': int(row[0][0]),
            '_monthlyrevenue': int(MonthlyRevenue),
            '_lastmonthlyrevenue': int(LastMonthlyRevenue),
            '_lastyearmonthlyrevenue': int(LastYearMonthlyRevenue),
            '_monthlyincreaserevenue': float(MonthlyIncreaseRevenue),
            '_lastyearmonthlyincreaserevenue': float(LastYearMonthlyIncreaseRevenue),
            '_cumulativerevenue': int(CumulativeRevenue),
            '_lastyearcumulativerevenue': int(LastYearCumulativeRevenue),
            '_comparecumulativerevenue': float(CompareCumulativeRevenue),
            '_date': date,
        }
        cursor.execute(add_fs, data_fs)
        db.commit()
    except mcon.Error as err:
        dbgPrint("InsertMonthlyRevenue: Connect to DB Error [" + str(err) + "]")
        return -1
    dbgPrint("InsertMonthlyRevenue: Insert Completed: " + str(data_fs))
    return 0
def InsertStockExchange(stockID, ExchangeVolume, StartPrice, HighPrice, LowPrice, EndPrice, Category, date):
    if (stockID.strip() == "" or
            ExchangeVolume.strip() == "" or StartPrice.strip() == "" or
            HighPrice.strip() == "" or LowPrice.strip() == "" or
            EndPrice.strip() == "" or date.strip() == ""):
        dbgPrint("InsertStockExchange: Parameters cannot be empty")
        return -1
    if (stockID.lstrip('-+').isdigit() == False) and (not isfloat(stockID)):
        dbgPrint("InsertStockExchange: stockID must be a digit")
        return -1
    else:
        # Convert stockID from float to int, then to string.
        stockID = str(int(float(stockID)))
    try:
        valid_date(date)
        # Get CoId
        cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
        row = cursor.fetchall()
        if cursor.rowcount <= 0:
            dbgPrint("InsertStockExchange: Error: Cannot locate Company ID "
                     + str(stockID) + ":" + str(cursor.rowcount))
            return -1
        if check_record(str(row[0][0]), date, Category, "StockExchange") != 0:
            dbgPrint("InsertStockExchange: Error: Record already exists, please make sure there are no duplicates")
            return -1
        add_fs = ("INSERT INTO StockExchange "
                  "(CoId, ExchangeVolume, StartPrice, HighPrice, LowPrice, EndPrice, Category, Date) "
                  "VALUES (%(_coid)s, %(_exchangevolume)s, %(_startprice)s, %(_highprice)s, "
                  "%(_lowprice)s, %(_endprice)s, %(_category)s, %(_date)s)")
        data_fs = {
            '_coid': int(row[0][0]),
            '_exchangevolume': int(ExchangeVolume),
            '_startprice': float(StartPrice),
            '_highprice': float(HighPrice),
            '_lowprice': float(LowPrice),
            '_endprice': float(EndPrice),
            '_category': int(Category),
            '_date': date,
        }
        cursor.execute(add_fs, data_fs)
        db.commit()
    except mcon.Error as err:
        dbgPrint("InsertStockExchange: Connect to DB Error [" + str(err) + "]")
        return -1
    dbgPrint("InsertStockExchange: Insert Completed: " + str(data_fs))
    return 0
def date_trans(datestr):
    # format - June 29, 2019
    def get_key(d, value):
        return [k for k, v in d.items() if v == value]

    month, date, year = datestr.split()
    date = date.strip(',')
    month = get_key(monthDict, month)[0]
    return str2date('{}-{}-{}'.format(year, month, date))
def format_date_us_history(strDate):
    if DateUtil.isVaildDate(strDate):
        return strDate
    tupDate = strDate.partition("|")
    chineseDate = tupDate[2] + ":00"
    date = str(chineseDate)
    # Replace the Chinese date markers: 年 (year), 月 (month), 日 (day).
    date = date.replace("年", "-")
    date = date.replace("月", "-")
    date = date.replace("日", "")
    date = date.strip()
    return date
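# A sketch of what format_date_us_history() above produces, assuming the
# external DateUtil.isVaildDate() check rejects the raw string so the
# Chinese-marker branch runs; the text before "|" is discarded, ":00"
# seconds are appended, and 年/月/日 are rewritten:
#
#   >>> format_date_us_history(u'华盛顿|2019年6月29日 12:30')
#   '2019-6-29 12:30:00'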
def get_date(self, response):
    matches = [
        'christophclarkonline', 'gapingangels', 'jakemalone', 'joeysilvera',
        'lewood', 'nachovidalhardcore', 'povblowjobs', 'tittycreampies'
    ]
    if any(x in response.url for x in matches):
        date = response.xpath('//script[contains(text(),"sceneReleaseDate")]').get()
        date = re.search(r'sceneReleaseDate":"(\d{4}-\d{2}-\d{2})', date).group(1)
    else:
        date = self.process_xpath(response, self.get_selector_map('date')).getall()
        if len(date) > 1:
            for daterow in date:
                datetemp = ""
                daterow = daterow.replace('Released:', '').replace('Added:', '').strip()
                if re.match(r'(\d{4}-\d{2}-\d{2})', daterow):
                    datetemp = re.search(r'(\d{4}-\d{2}-\d{2})', daterow).group(1).strip()
                elif re.match(r'(\d{2}-\d{2}-\d{4})', daterow):
                    datetemp = re.search(r'(\d{2}-\d{2}-\d{4})', daterow).group(1).strip()
                if datetemp:
                    # Keep the row that actually parsed as a date.
                    date = datetemp
    matches = ['21sextreme']
    if not date or any(x in response.url for x in matches):
        date = response.xpath('//script[contains(text(),"sceneReleaseDate")]').getall()
        if len(date) > 1:
            for daterow in date:
                datetemp = re.search(r'sceneReleaseDate":"(\d{4}-\d{2}-\d{2})', daterow)
                if datetemp:
                    datetemp = datetemp.group(1)
                if datetemp:
                    date = datetemp.strip()
    if not date:
        date = response.xpath('//div[@class="updatedDate"]/b/following-sibling::text()').get()
    return self.parse_date(date.strip(), date_formats=['%m-%d-%Y', '%Y-%m-%d']).isoformat()
def conver_time_to_epoch(date, format=None):
    date = date.strip()
    if format:
        try:
            # Return the UTC epoch seconds for the caller-supplied format.
            return calendar.timegm(datetime.strptime(date, format).timetuple())
        except ValueError:
            pass
    try:
        return calendar.timegm(datetime.strptime(date, "%Y-%m-%dT%H:%M:%S").timetuple())
    except ValueError:
        pass
    return ''
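# A quick sanity check for conver_time_to_epoch() above, assuming `calendar`
# and `datetime` are imported as the function requires; timegm() means the
# result is a UTC epoch:
#
#   >>> conver_time_to_epoch('1970-01-01T00:00:10')
#   10
#   >>> conver_time_to_epoch('01/02/2020', format='%d/%m/%Y')
#   1580515200
#   >>> conver_time_to_epoch('garbage')
#   ''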
def __init__(self, id, name, date, oil, gas, water, injection, work_time):
    self.id = id.strip()
    if not len(self.id) == 9:
        raise ValueError('Invalid well id')
    self.name = name.strip()
    if isinstance(date, str):
        self.date = datetime.strptime(date.strip(), '%d%m%Y')
    else:
        # Keep non-string dates (e.g. datetime) as-is.
        self.date = date
    self.oil = float(oil)
    self.gas = float(gas)
    self.water = float(water)
    self.injection = float(injection)
    self.work_time = float(work_time)
def InsertMarginTrade(stockID, MarginBuy, MarginSell, MarginRemine, ShortSellBuy,
                      ShortSellSell, ShortSellRemine, TotalVolume, ChargeOff, Category, date):
    if (stockID.strip() == "" or
            MarginBuy.strip() == "" or MarginSell.strip() == "" or
            MarginRemine.strip() == "" or ShortSellBuy.strip() == "" or
            ShortSellSell.strip() == "" or ShortSellRemine.strip() == "" or
            TotalVolume.strip() == "" or ChargeOff.strip() == "" or date.strip() == ""):
        dbgPrint("InsertMarginTrade: Parameters cannot be empty")
        return -1
    try:
        valid_date(date)
        # Get CoId
        cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
        row = cursor.fetchall()
        if cursor.rowcount <= 0:
            dbgPrint("InsertMarginTrade: Error: Cannot locate Company ID "
                     + str(stockID) + ":" + str(cursor.rowcount))
            return -1
        if check_record(str(row[0][0]), date, Category, "MarginTrading") != 0:
            dbgPrint("InsertMarginTrade: Error: Record already exists, please make sure there are no duplicates")
            return -1
        add_fs = ("INSERT INTO MarginTrading (CoId, MarginBuy, MarginSell, MarginRemine, ShortSellBuy, "
                  "ShortSellSell, ShortSellRemine, TotalVolume, ChargeOff, Category, date) "
                  "VALUES (%(_coid)s, %(_marginbuy)s, %(_marginsell)s, %(_marginremine)s, %(_shortsellbuy)s, "
                  "%(_shortsellsell)s, %(_shortsellremine)s, %(_totalvolume)s, %(_chargeoff)s, %(_category)s, %(_date)s)")
        data_fs = {
            '_coid': int(row[0][0]),
            '_marginbuy': int(MarginBuy),
            '_marginsell': int(MarginSell),
            '_marginremine': int(MarginRemine),
            '_shortsellbuy': int(ShortSellBuy),
            '_shortsellsell': int(ShortSellSell),
            '_shortsellremine': int(ShortSellRemine),
            '_totalvolume': int(TotalVolume),
            '_chargeoff': int(ChargeOff),
            '_category': int(Category),
            '_date': date,
        }
        cursor.execute(add_fs, data_fs)
        db.commit()
    except mcon.Error as err:
        dbgPrint("InsertMarginTrade: Insert Error [" + str(err) + "]")
        return -1
    dbgPrint("InsertMarginTrade: Insert Completed: " + str(data_fs))
    return 0
def __init__(self, id, name, date, bhp, buff_pressure, annular_pressure, line_pressure, form_pressure):
    self.id = id.strip()
    if not len(self.id) == 9:
        raise ValueError('Invalid well id')
    self.name = name.strip()
    if isinstance(date, str):
        self.date = datetime.strptime(date.strip(), '%d%m%Y')
    else:
        # Keep non-string dates (e.g. datetime) as-is.
        self.date = date
    self.bhp = float(bhp)
    self.buff_pressure = float(buff_pressure)
    self.annular_pressure = float(annular_pressure)
    self.line_pressure = float(line_pressure)
    self.form_pressure = float(form_pressure)
def scraper_for_hours(date):
    """Scrape the UCLA dining-hall hours table for the given date."""
    url = "http://menu.dining.ucla.edu/Hours" + "/" + date
    hours = {"hourDate": date.strip(), "hours": []}
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    hour_table = soup.find("table", class_="hours-table")
    if hour_table is None or hour_table.find("tbody") is None:
        return hours
    header_order_dict = {}
    counter = 0
    # header
    for tr in hour_table.find("thead"):
        for td in tr.find_all("th"):
            if td.string.strip() == "":
                header_order_dict[str(counter)] = "hall_name"
            else:
                title = td.string.strip().lower()
                if "/" in title:
                    header_order_dict[str(counter)] = title.split("/")[0]
                elif " " in title:
                    header_order_dict[str(counter)] = title.replace(" ", "_")
                else:
                    header_order_dict[str(counter)] = title
            counter = counter + 1
    # data
    for tr in hour_table.find("tbody").find_all("tr"):
        counter = 0
        dining_hall_dict = {}
        for td in tr.find_all("td"):
            if td.find("span") is not None:
                dining_hall_dict[header_order_dict[str(counter)]] = td.find("span").string.strip()
            else:
                dining_hall_dict[header_order_dict[str(counter)]] = td.string.strip()
            counter = counter + 1
        hours["hours"].append(dining_hall_dict)
    return hours
def InsertFinancialStatement(stockID, asset, equity, date):
    if stockID.strip() == "" or asset.strip() == "" or equity.strip() == "" or date.strip() == "":
        dbgPrint("InsertFinancialStatement: Parameters cannot be empty")
        return -1
    if (stockID.lstrip('-+').isdigit() == False) and (not isfloat(stockID)):
        dbgPrint("InsertFinancialStatement: stockID must be a digit")
        return -1
    else:
        # Convert stockID from float to int, then to string.
        stockID = str(int(float(stockID)))
    if not (asset.isdigit() and equity.isdigit()):
        dbgPrint("InsertFinancialStatement: asset and equity must be numbers")
        return -1
    try:
        valid_date(date)
        # Get CoId
        cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
        row = cursor.fetchall()
        if cursor.rowcount <= 0:
            dbgPrint("InsertFinancialStatement: Error: Cannot locate Company ID " + str(cursor.rowcount))
            return -1
        if check_record(str(row[0][0]), date, "", "FinancialStatement") != 0:
            dbgPrint("InsertFinancialStatement: Error: Record already exists, please make sure there are no duplicates")
            return -1
        add_fs = ("INSERT INTO FinancialStatement "
                  "(CoId, TotalAsset, TotalEquity, Date) "
                  "VALUES (%(_coid)s, %(_asset)s, %(_equity)s, %(_date)s)")
        data_fs = {
            '_coid': int(row[0][0]),
            '_asset': int(asset),
            '_equity': int(equity),
            '_date': date,
        }
        cursor.execute(add_fs, data_fs)
        db.commit()
    except mcon.Error as err:
        dbgPrint("InsertFinancialStatement: Connect to DB Error [" + str(err) + "]")
        return -1
    dbgPrint("InsertFinancialStatement: Insert Completed: " + str(data_fs))
    return 0
def populate_date(date):
    """ Try to find date patterns and convert them to default one. """
    v = date.strip()
    patterns = [
        (r"^D:(\d\d\d\d)(\d\d)(.*)$", r"\1-\2"),
        (r"^(\d\d\d\d)(\d\d)(.*)$", r"\1-\2"),
        (r"^(\d+)/\d+/(\d\d+) \d\d(.*)$", r"\2-\1"),
        (r"^.* (\d\d\d\d)(?:\W+.*$|$)", r"\1-??"),
    ]
    for (pat, repl) in patterns:
        v_new = re.sub(pat, repl, v)
        if v != v_new:
            return v_new.replace("-0", "-")  # remove 0 from month
    return "??"  # better to avoid javascript problems and indicate error
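# A short illustration of populate_date() above (the inputs are hypothetical);
# each pattern normalizes a differently shaped string to "YYYY-M", and the
# final .replace("-0", "-") drops a leading zero from the month:
#
#   >>> populate_date('D:20190315120000')  # PDF-style "D:" prefix
#   '2019-3'
#   >>> populate_date('3/15/2019 12:00')
#   '2019-3'
#   >>> populate_date('published in 2007.')
#   '2007-??'
#   >>> populate_date('no year here')
#   '??'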
def generate_payout_dates_cd(payout_file_path, trading_date_file,
                             expected_seg="NseCD", year=2020):
    # Generate future payout dates up to the given year.
    sanity_check(payout_file=payout_file_path,
                 trading_date_file=trading_date_file,
                 matched_with=expected_seg)
    nse_cd_dates = defaultdict(dict)  # dict(year: dict(month: [list of business days]))
    _, last_dt = string_to_date(extract_last_line(payout_file_path).strip("\n"))
    with open(trading_date_file, "r") as f:
        f.readline()  # skip the first line
        for date in f:
            date = date.strip("\n")  # remove the trailing "\n" from the date
            _, dt = string_to_date(date)
            if dt > last_dt:
                # Add every date that is greater than the last date in the payout file.
                _year, month, day = date.split("-")
                month_dict = nse_cd_dates[_year]  # month -> [list of trading days]
                if month not in month_dict:
                    month_dict[month] = [day]
                else:
                    month_dict[month] += [day]
                nse_cd_dates[_year].update(month_dict)
    result = []
    with open(payout_file_path, "a") as fout:
        for _year in nse_cd_dates:
            if int(_year) <= year:
                for month in nse_cd_dates[_year]:
                    # Add a date only if the month is greater than the last appended date.
                    if int(month) > last_dt.month:
                        payout = _year + "-" + month + "-" + nse_cd_dates[_year][month][-3]
                        fout.write(payout + "\n")
                        result += [payout]
    logging.debug("appended dates: %s" % result)
    return result
def __init__(self, well_id, date, buff_pressure, annular_pressure, line_pressure):
    self.well_id = well_id.strip()
    try:
        self.date = datetime.strptime(date.strip(), '%d.%m.%Y')
    except (AttributeError, TypeError, ValueError):
        self.date = date
    try:
        self.annular_pressure = float(annular_pressure)
    except (TypeError, ValueError):
        self.annular_pressure = 0
    try:
        self.buff_pressure = float(buff_pressure)
    except (TypeError, ValueError):
        self.buff_pressure = 0
    try:
        self.line_pressure = float(line_pressure)
    except (TypeError, ValueError):
        self.line_pressure = 0
def __init__(self, well_id, date, bhp, form_pressure1, form_pressure2):
    self.well_id = well_id.strip()
    try:
        self.date = datetime.strptime(date.strip(), '%d.%m.%Y')
    except (AttributeError, TypeError, ValueError):
        self.date = date
    try:
        self.bhp = float(bhp)
    except (TypeError, ValueError):
        self.bhp = 0
    try:
        self.form_pressure1 = float(form_pressure1)
    except (TypeError, ValueError):
        self.form_pressure1 = 0
    try:
        self.form_pressure2 = float(form_pressure2)
    except (TypeError, ValueError):
        self.form_pressure2 = 0
def check_record(sID, date, category, table):
    # NOTE: `table` is interpolated into the SQL, so it must come from trusted code.
    try:
        if date.strip() == "":
            cursor.execute(
                "SELECT COUNT(1) FROM " + table + " WHERE StockID = %s limit 1", (sID,))
        elif category.strip() == "":
            cursor.execute(
                "SELECT COUNT(1) FROM " + table + " WHERE CoId = %s and Date = %s limit 1",
                (sID, date))
        else:
            cursor.execute(
                "SELECT COUNT(1) FROM " + table + " WHERE CoId = %s and Date = %s and Category = %s limit 1",
                (sID, date, category))
        res = cursor.fetchone()
        if res[0] > 0:
            dbgPrint("check_record: Error: Record already exists")
            return -1
    except mcon.Error as err:
        dbgPrint("check_record: DB Error, table[" + table + "] err [" + str(err) + "]")
        return -1
    dbgPrint("check_record Completed: table [" + table + "] [" + str(sID) + "]")
    return 0
def build_input_data(date, hour_hh, muni_indices):
    # Check with NOAA whether the data is complete.
    data_finished(hour_hh.strip(), date.strip())
    url_list = create_grib_url_list(date, hour_hh)
    file_names = download_all_grib(url_list)
    # Initialize a GroupedArray named muni_data_bank.
    muni_data_bank = GroupedArray(muni_indices.keys())
    # For each downloaded file, fill muni_data_bank with the extracted data.
    for each_file in tqdm(file_names, total=len(file_names), desc="Parsing grib files"):
        if not parse_grib(each_file, muni_indices, muni_data_bank):
            send_error_email(ERROR_NAM_MESSAGE_1 % get_path_dir('input_data', 'grib_test.grib2'))
            raise Exception(
                'grib_grab shouldn\'t fail if data_finished method succeeds. '
                'Check data for %s' % "00")
    output_str = write_json_data(muni_data_bank, hour_hh)
    return output_str
def iso8601date(date, format=None):
    """Convert a date to ISO8601 date format

    input format: YYYY-MM-DD HH:MM:SS GMT (works less reliably for other TZs)
               or YYYY-MM-DD HH:MM:SS.0
               or YYYY-MM-DD
               or epoch (13 digit, indicating ms)
               or epoch (10 digit, indicating sec)
    output format: iso8601
    """
    date = date.strip()
    formats = [
        "%Y-%m-%d %H:%M:%S %Z",
        "%A, %b %d, %Y",
        "%Y-%m-%d %H:%M:%S.0",
        "%Y-%m-%d",
        "%b %d, %Y",
        "%B %d, %Y",
        "%B %d, %Y %I:%M %p",
    ]
    if format:
        formats.insert(0, format)
    for fmt in formats:
        try:
            return datetime.strptime(date, fmt).isoformat()
        except Exception:
            pass
    try:
        date = int(date)
        if 1000000000000 < date < 9999999999999:  # 13 digit epoch (ms)
            return datetime.fromtimestamp(mktime(gmtime(date / 1000))).isoformat()
        if 1000000000 < date < 9999999999:  # 10 digit epoch (sec)
            return datetime.fromtimestamp(mktime(gmtime(date))).isoformat()
    except Exception:
        pass
    # If all else fails, return empty
    return ''
def _set_date(self, date):
    if isinstance(date, datetime):
        self.db_date = date
    elif isinstance(date, basestring) and date.strip() != '':
        newDate = datetime(*strptime(date, '%d %b %Y %H:%M:%S')[0:6])
        self.db_date = newDate
def scrape_links(links):
    # Cleaner to remove every tag.
    maincleaner = Cleaner(allow_tags=['div'], remove_unknown_tags=False, remove_tags=['div'])
    for link in links:  # loop through all the links
        if link == last_link:
            # This link has already been scraped (this will eventually be
            # changed to check dates); break out of the loop.
            break
        # Scrape the contents of the current link and decode from Windows-1252 encoding.
        linkhtml = scraperwiki.scrape(link).decode('latin_1')
        print link
        root = lxml.html.fromstring(linkhtml)  # turn scraped content into an HTML object

        # GET TITLE
        title = root.cssselect("h1")[0].text.encode('utf-8')  # grab the page header (title)
        title = replace_all(title, subDic)  # replace alphanumeric obfuscations with letters

        # GET DATE
        # Get the text of the element that contains the date and time of the post.
        date = root.cssselect("div.adInfo")[0].text
        cleandate = re.sub(r'(\S+\s+\d+,\s+\d\d\d\d)(?:,?) (\d+\:\d+ \w\w)', r'\1 \2',
                           date.strip())  # get the date into a standard format
        cleandate = re.search(r'\S+\s+\d+, \d\d\d\d \d+\:\d+ \w\w', cleandate).group(0)
        rawdate = datetime.strptime(cleandate, '%B %d, %Y %I:%M %p')  # Month dd, YYYY
        date = rawdate.strftime('%Y-%m-%d %H:%M')  # back to a string of format YYYY-mm-dd

        # GET MAIN BODY TEXT
        mainwithtags = root.cssselect("div.postingBody")[0]  # body text of the post
        main = maincleaner.clean_html(mainwithtags).text.encode('utf-8')  # strip all HTML tags
        main = replace_all(main, subDic)  # replace alphanumeric obfuscations with letters

        # GET PHONE NUMBER(S)
        # Replace common phone number obfuscations with actual numbers.
        stripped = replace_all(main.lower(), wordDic)
        phonecomp = re.compile(r"[\s\-/=\.,{}_\!\@\#\$\%\^\&\*\(\)\~]")  # known dividers
        stripped = phonecomp.sub('', stripped)  # remove phone number dividers
        # Search for groups of 10 consecutive digits (with an optional preceding 1).
        phone = re.findall(r'(?:1?)[1-9]\d{9}', stripped)
        phone = list(set(phone))  # drop duplicate numbers
        phone = ", ".join(phone)  # format as "phone1, phone2, ... phoneN"

        # GET LISTED AGE
        if root.cssselect("p.metaInfoDisplay"):  # does the entry have metainfo?
            listedage = root.cssselect("p.metaInfoDisplay")[0]  # first metainfo element
            listedage = re.sub(r"[^\d]", "", listedage.text)  # keep only the digits
        else:
            listedage = ""

        # GET LOCATION
        area = None
        if re.findall(r'Location\:(.*?)\</div\>', linkhtml, flags=re.DOTALL):
            location = re.findall(r'Location\:(.*?)\</div\>', linkhtml,
                                  flags=re.DOTALL)[0].encode('utf-8')
            location = removeNonAscii(location)
            for neighborhood in NEIGHBORHOODS:
                if neighborhood in location.lower():
                    area = neighborhood
            print repr(area)
            print repr(location)
        else:
            location = ""

        # GET PICTURES
        picturelist = []
        pictures = root.cssselect('ul#viewAdPhotoLayout img')
        for i in range(len(pictures)):
            largepic = re.sub('/medium/', '/large/', pictures[i].get('src'))
            picturelist.append(largepic)
        print picturelist
        picturelist = " ".join(picturelist)
        x = urllib.urlopen(largepic).read()
        piccode = base64.encodestring(x)
        print piccode

        # Set up our data record.
        record = {}
        record['Title'] = title
        record['Date'] = date
        record['Main'] = main
        record['Pictures'] = picturelist
        record['Phone'] = phone
        record['Listed Age'] = listedage
        record['Location'] = location
        record['area'] = area
        record['PicCode'] = piccode
        # Finally, save the record to the datastore - 'Title' is our unique key.
        scraperwiki.sqlite.save(["Title"], record)
        time.sleep(2)
def handle(self, *args, **options):
    base_url = "http://www.dsca.mil/"
    results = []
    # Create the list of archive links to search.
    links2archives = []
    year = int(datetime.strftime(datetime.today().date(), "%Y"))
    month = int(datetime.strftime(datetime.today().date(), "%m"))
    if month == 1:
        month_limit = 12
        year_limit = year - 1
    else:
        month_limit = month - 1
        year_limit = year
    # Set to 2008 for full records.
    while year >= year_limit:
        while month >= month_limit:
            if year == 2008 and month == 5:
                break
            if len(str(month)) < 2:
                month_format = "0" + str(month)
            else:
                month_format = str(month)
            link = "http://www.dsca.mil/major-arms-sales/archives/" + str(year) + month_format
            links2archives.append(link)
            month = month - 1
        month = 12
        year = year - 1
    # Find titles and links to pages.
    for link in links2archives:
        print "working on ", link
        archive_page = soupify(link)
        archive_body = archive_page.select(".view-content")
        # There are not entries for every month.
        try:
            archive_body = archive_body[1]
        except IndexError:
            continue
        info = archive_body.select(".mas-regions")
        # Find info for each profile.
        for profile in info:
            links2pages = profile.find_all("a")
            pagelink = links2pages[0].get("href")
            pagelink = base_url + pagelink
            try:
                existing_record = Proposed.objects.get(dsca_url=pagelink)
                print "exists"
            except:
                title = links2pages[0].text
                date_p = profile.find_all("div")[-1]
                date_p = date_p.text
                if "Defense Security Cooperation Agency\n" in date_p:
                    date_p = date_p.replace("Defense Security Cooperation Agency\n", "")
                date = date_p.split(u"–")
                date = date[0]
                date = date.replace("WASHINGTON, ", "")
                date = date.strip()
                if len(date) > 25:
                    date = date_p.split("-")
                    date = date[0]
                    date = date.replace("WASHINGTON, ", "")
                    date = date.strip()
                if len(date) > 25:
                    date = date_p.split("--")
                    date = date[0]
                    date = date.replace("WASHINGTON, ", "")
                    date = date.strip()
                try:
                    date_obj = datetime.strptime(date, "%b %d, %Y")
                except:
                    if "Sept." in date or "Sept " in date:
                        date = date.replace("Sept", "Sep")
                    try:
                        date_obj = datetime.strptime(date, "%b. %d, %Y")
                    except:
                        pass
                    try:
                        date_obj = datetime.strptime(date, "%B %d, %Y")
                    except:
                        date_obj = None
                # Look at the individual page.
                page = soupify(pagelink)
                # A few don't have pdfs.
                try:
                    pdf_link = page.select(".file")[0].find_all("a")
                except:
                    pdf_link = None
                if pdf_link != None:
                    pdf_link = pdf_link[0].get("href")
                data_text = ''
                field_text = page.select(".field-item")
                for d in field_text:
                    data_text = data_text + "\n" + d.text
                record = Proposed(
                    title=title,
                    text=data_text,
                    date=date_obj,
                    dsca_url=pagelink,
                    pdf_url=pdf_link,
                )
                country = title.split(u"–")
                if len(country) <= 1:
                    country = title.split(u"-")
                country = country[0]
                country = country.replace("Government of ", "")
                country = country.replace("The ", "")
                country = country.strip()
                cleaning = {
                    "Iraq F": "Iraq",
                    "Republic of Korea": "South Korea",
                    "Republic of Korea (ROK)": "South Korea",
                    "United Arab Emirates (UAE)": "United Arab Emirates",
                    "Taipei Economic and Cultural Representative Office in the United States": "Taiwan",
                    "Kingdom of Morocco": "Morocco",
                }
                if cleaning.has_key(country):
                    country = cleaning[country]
                try:
                    matching_loc = Location.objects.get(location=country)
                    loc_id = int(matching_loc.id)
                    record.location_id = loc_id
                    record.location = matching_loc.location
                    print loc_id
                except:
                    matching_loc = None
                record.save()
                print "added record %s" % (record)
                # Save to Amazon.
                try:
                    file_name = "arms_pdf/" + str(record.id) + ".pdf"
                    pdf_link = str(pdf_link)
                    u = urllib2.urlopen(pdf_link)
                    localFile = default_storage.open(file_name, 'w')
                    localFile.write(u.read())
                    localFile.close()
                except:
                    print 'not working'
                    message = 'bad upload ' + title
                    logger.error(message)
                results.append({
                    "title": title,
                    "date": date,
                    "link": pagelink,
                    "pdf_link": pdf_link,
                    "text": data_text,
                })
                try:
                    doc = {
                        'title': title,
                        'text': data_text,
                        'location': record.location,
                        'location_id': record.location_id,
                        'date': record.date,
                    }
                    print "made doc"
                    res = es.index(index="foreign", doc_type='arms', id=record.id, body=doc)
                except:
                    message = 'bad pdf no elasticsearch upload for - %s' % (title)
                    logger.error(message)
                print title
def handle(self, *args, **options):
    base_url = "http://www.dsca.mil/"
    results = []
    # Create the list of archive links to search.
    links2archives = []
    year = int(datetime.strftime(datetime.today().date(), "%Y"))
    month = int(datetime.strftime(datetime.today().date(), "%m"))
    if month == 1:
        month_limit = 12
        year_limit = year - 1
    else:
        month_limit = month - 1
        year_limit = year
    # Set to 2008 for full records.
    while year >= year_limit:
        if year == 2008 and month == 5:
            break
        while month >= month_limit:
            if year == 2008 and month == 5:
                break
            if len(str(month)) < 2:
                month_format = "0" + str(month)
            else:
                month_format = str(month)
            link = "http://www.dsca.mil/major-arms-sales/archives/" + str(year) + month_format
            links2archives.append(link)
            month = month - 1
        month = 12
        year = year - 1
    # Find titles and links to pages.
    for link in links2archives:
        print "working on ", link
        archive_page = soupify(link)
        archive_body = archive_page.select(".view-content")
        # There are not entries for every month.
        try:
            archive_body = archive_body[1]
        except IndexError:
            continue
        info = archive_body.select(".mas-regions")
        # Find info for each profile.
        for profile in info:
            links2pages = profile.find_all("a")
            pagelink = links2pages[0].get("href")
            pagelink = base_url + pagelink
            try:
                existing_record = Proposed.objects.get(dsca_url=pagelink)
            except:
                title = links2pages[0].text
                date_p = profile.find_all("div")[-1]
                date_p = date_p.text
                if "Defense Security Cooperation Agency\n" in date_p:
                    date_p = date_p.replace("Defense Security Cooperation Agency\n", "")
                date = date_p.split(u"–")
                date = date[0]
                date = date.replace("WASHINGTON, ", "")
                date = date.strip()
                if len(date) > 25:
                    date = date_p.split("-")
                    date = date[0]
                    date = date.replace("WASHINGTON, ", "")
                    date = date.strip()
                if len(date) > 25:
                    date = date_p.split("--")
                    date = date[0]
                    date = date.replace("WASHINGTON, ", "")
                    date = date.strip()
                try:
                    date_obj = datetime.strptime(date, "%b %d, %Y")
                except:
                    if "Sept." in date or "Sept " in date:
                        date = date.replace("Sept", "Sep")
                    try:
                        date_obj = datetime.strptime(date, "%b. %d, %Y")
                    except:
                        pass
                    try:
                        date_obj = datetime.strptime(date, "%B %d, %Y")
                    except:
                        date_obj = None
                # Look at the individual page.
                page = soupify(pagelink)
                print_link = page.select(".print_html")[0].find_all("a")
                print_link = print_link[0].get("href")
                # A few don't have pdfs.
                try:
                    pdf_link = page.select(".file")[0].find_all("a")
                except:
                    pdf_link = None
                if pdf_link != None:
                    pdf_link = pdf_link[0].get("href")
                print_page = soupify(print_link)
                data_text = print_page.select(".print-content")[0]
                data_text = data_text.text
                record = Proposed(
                    title=title,
                    text=data_text,
                    date=date_obj,
                    dsca_url=pagelink,
                    pdf_url=pdf_link,
                    print_url=print_link,
                )
                country = title.split(u"–")
                if len(country) <= 1:
                    country = title.split(u"-")
                country = country[0]
                country = country.replace("Government of ", "")
                country = country.replace("The ", "")
                country = country.strip()
                cleaning = {
                    "Iraq F": "Iraq",
                    "Republic of Korea": "South Korea",
                    "Republic of Korea (ROK)": "South Korea",
                    "United Arab Emirates (UAE)": "United Arab Emirates",
                    "Taipei Economic and Cultural Representative Office in the United States": "Taiwan",
                    "Kingdom of Morocco": "Morocco",
                }
                if cleaning.has_key(country):
                    country = cleaning[country]
                try:
                    matching_loc = Location.objects.get(location=country)
                    loc_id = int(matching_loc.id)
                    record.location_id = loc_id
                except:
                    matching_loc = None
                record.save()
                # Save to Amazon.
                try:
                    file_name = "arms_pdf/" + str(record.id) + ".pdf"
                    pdf_link = str(pdf_link)
                    u = urllib2.urlopen(pdf_link)
                    localFile = default_storage.open(file_name, 'w')
                    localFile.write(u.read())
                    localFile.close()
                except:
                    print 'not working'
                    message = 'bad upload ' + title
                    logger.error(message)
                results.append({
                    "title": title,
                    "date": date,
                    "link": pagelink,
                    "pdf_link": pdf_link,
                    "print_link": print_link,
                    "text": data_text,
                })
                print title
job_listings = soup.find_all('div', {'class': 'job-listing-job-item'})
for job_listing in job_listings:
    job_description = job_listing.find_all('span')
    # Get job title and link.
    job_title = job_description[0].a.text
    info_link = 'https://recruiting.paylocity.com' + job_description[0].a['href']
    job_summary = info_link
    # Get date as string.
    date = job_description[1].text
    # Clean up the date string by removing trailing -'s, then split and
    # convert to a datetime object.
    if date[len(date) - 2] == '-':
        date = date[0:len(date) - 3]
    date = date.strip().split('/')
    month = int(date[0])
    day = int(date[1])
    year = int(date[2])
    job_post_date = datetime(year, month, day)
    # Get location.
    job_location = job_listing.find('div', {'class': 'location-column'}).span.text
    # Get the soup of the job listing to scrape more info.
    listing_soup = get_soup(info_link)
    listing_body = listing_soup.find('body').find_all('p')
    # Retrieve full/part-time and salary info if available.
    if 'Location' in listing_body[0].text:
        location_string = listing_body[0].text.split(':')[1].lstrip()
        zip_code_result = re.search(r'(\d{5})', location_string)
def parse_date(date, lang):
    d = dateparser.parse(date.strip(), languages=[lang])
    return d
def parse_pub_dates(request):
    return [
        date.strip()
        for date in request.POST["pub_date"].split("\n")
        if date.strip() != ""
    ]
def render():
    global stages, teams, places, rounds, space, games, dic_slice_2_games, \
        dates, tournaments, tournamentPos, goals
    sliceId = 0
    shares = {
        "teams": 350,
        "calendar": 600 // 3,
        "places": 650 // 3,
        "stages": 400 - 650 // 3,
    }
    space = (1000 - (shares["teams"] + shares["calendar"]
                     + shares["places"] + shares["stages"])) // 4
    calendar = []
    # List of teams.
    for t in teams:
        t["value"] = shares["teams"] / len(teams)
        t["color"] = "#4daa4b"
        t["sliceId"] = sliceId
        t["id_group"] = 0
        teams_name_dic[str(t["id"])] = t["name"]
        dic_sliceId[sliceId] = 0
        dic_name2sliceId[str(t["id"])] = sliceId
        dic_sliceId2name[sliceId] = t["id"]
        sliceId += 1
    sliceId += 1  # for spacing
    # Games calendar.
    for date in dates:
        c = {}
        strTime = getNormalDate(date.strip())
        Time = datetime.strptime(strTime.strip(), '%Y-%m-%d')
        c["value"] = shares["calendar"] / len(dates)
        c["name"] = Time.strftime(" %d %B") + " " + Time.strftime("%A")[0:3] + "."
        c["color"] = "#ddea4f"
        c["sliceId"] = sliceId
        c["id_group"] = 1
        dic_sliceId[sliceId] = 1
        dic_name2sliceId[strTime] = sliceId
        dic_sliceId2name[sliceId] = strTime
        sliceId += 1
        calendar.append(c)
    sliceId += 1  # for spacing
    # Stadium + city.
    for p in places:
        p["name"] = p["stadium"].split("|")[0] + ";" + p["city"]
        p["value"] = shares["places"] / len(places)
        p["color"] = "#4a69a9"
        p["sliceId"] = sliceId
        p["id_group"] = 2
        dic_sliceId[sliceId] = 2
        dic_name2sliceId[p["id"]] = sliceId
        dic_sliceId2name[sliceId] = p["id"]
        sliceId += 1
    sliceId += 1  # for spacing
    # Rounds.
    for s in stages:
        s["value"] = shares["stages"] / len(stages)
        s["color"] = "#a89449"
        s["sliceId"] = sliceId
        s["id_group"] = 3
        dic_sliceId[sliceId] = 3
        dic_name2sliceId["s" + str(s["id"])] = sliceId
        dic_sliceId2name[sliceId] = "s" + str(s["id"])
        sliceId += 1
    # Which games to show on click.
    for i in range(sliceId):
        dic_slice_2_games[i] = []
    click_events = []
    for curSlice in range(sliceId):
        click_events.append({
            "key": curSlice,
            "value": getConnectionBySliceId(curSlice),
        })
    slice_name = []
    for d in dic_slice_2_games:
        slice_name.append({"key": d, "value": dic_slice_2_games[d]})
    dic_slice_2_games = {}
    return render_template("world_cup2.html", teams=teams, rounds=calendar,
                           places=places, stages=stages, space=space,
                           outGroups=outGroups, click_events=click_events,
                           games_clear=games_clear, slice_name=slice_name,
                           games_playoff=games_playoff, tournaments=tournaments,
                           tournamentPos=tournamentPos, goals=goals)
def iso8601date(date, date_format=None):
    """Convert a date to ISO8601 date format

    input format: YYYY-MM-DD HH:MM:SS GMT (works less reliably for other TZs)
               or YYYY-MM-DD HH:MM:SS.0
               or YYYY-MM-DD
               or epoch (13 digit, indicating ms)
               or epoch (10 digit, indicating sec)
    output format: iso8601"""
    date = date.strip()
    formats = [
        "%Y-%m-%d %H:%M:%S",
        "%A, %B %d, %Y %I:%M %p",  # Friday, October 2, 2015 1:35 AM
        "%A, %d %B %Y, %H:%M",     # Friday, 2 October 2015, 18:23
        "%a %B %dst, %Y",          # Thu October 01st, 2015
        "%a %B %dnd, %Y",          # Thu October 02nd, 2015
        "%a %B %drd, %Y",          # Thu October 03rd, 2015
        "%a %B %dth, %Y",          # Thu October 04th, 2015
        "%Y-%m-%d %H:%M:%S %Z",
        "%A, %b %d, %Y",
        "%Y-%m-%d %H:%M:%S.0",
        "%Y-%m-%d",
        "%b %d, %Y",
        "%B %d, %Y",
        "%B %d, %Y %I:%M %p",
        "%b %d, %Y at %I:%M %p",
        "%m-%d-%Y",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%SZ",
    ]
    if date_format:
        formats.insert(0, date_format)
    for fmt in formats:
        try:
            return datetime.strptime(date, fmt).isoformat()
        except Exception:
            pass
    try:
        date = int(date)
        if 1000000000000 < date < 9999999999999:  # 13 digit epoch (ms)
            return datetime.fromtimestamp(mktime(gmtime(date / 1000))).isoformat()
        if 1000000000 < date < 9999999999:  # 10 digit epoch (sec)
            return datetime.fromtimestamp(mktime(gmtime(date))).isoformat()
    except Exception:
        pass
    # If all else fails, return empty
    return ''
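# A usage sketch for iso8601date() above, assuming `datetime` plus `mktime`
# and `gmtime` from the `time` module are imported; formats are tried in
# order, so an ambiguous input resolves to the first one that parses:
#
#   >>> iso8601date('2015-10-02 01:35:00')
#   '2015-10-02T01:35:00'
#   >>> iso8601date('Friday, October 2, 2015 1:35 AM')
#   '2015-10-02T01:35:00'
#   >>> iso8601date('1443749700000')  # 13-digit ms epoch, read as UTC
#   '2015-10-02T01:35:00'
#   >>> iso8601date('not a date')
#   ''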
def _set_date(self, date):
    if type(date) == datetime:
        self.db_date = date
    elif type(date) == type('') and date.strip() != '':
        newDate = datetime(*strptime(date, '%d %b %Y %H:%M:%S')[0:6])
        self.db_date = newDate
def odd_link(b, date, l, directory):
    text = b.get_text()
    # Not links to docs.
    try:
        link = l.get("href")
    except:
        pass
    # These are not documents.
    if "link" in locals():
        if link[-4:] == ".gov":
            return {"date_string": False, "real_title": False}
        elif link[-5:] == ".gov/" or link == "/usao/eousa/index.html":
            return {"date_string": False, "real_title": False}
    text = b.get_text()
    # Section for documents without dates:
    if date != None:
        if date.strip() == "Alleged Deception of Congress: The Congressional Task Force on Immigration Reform's Fact-Finding Visit to the Miami District of INS in June 1995":
            return {"date_string": "June 1, 1996",
                    "real_title": "Alleged Deception of Congress: The Congressional Task Force on Immigration Reform's Fact-Finding Visit to the Miami District of INS in June 1995"}
        if date == "Audit Report GR-30-00-001":
            return {"date_string": "November 1, 2000",
                    "real_title": "McMechen, West Virginia Police Department, Audit Report GR-30-00-001"}
        # No date; one other entry, giving it the same date.
        if date == "Georgia's Department of Corrections":
            return {"date_string": "November 1, 2000",
                    "real_title": "United States Marshals Service Cost Proposal for the Intergovernmental Service Agreement for Detention Facilities with the City of Atlanta, Georgia’s Department of Corrections"}
        # Confirmed no dates for these.
        no_dates = ("Audit Report GR-40-99-014", "Audit Report GR-40-99-011",
                    "Evaluation and Inspections Report I-2000-021",
                    "Evaluation and Inspections Report I-2000-018",
                    "Audit Report 99-03")
        if date.strip() in no_dates:
            date_string = datetime.now()
            date_string = datetime.strftime(date_string, "%B %d, %Y")
            return {"date_string": date_string, "real_title": text}
    # Intergovernmental Agreements for Detention Space external reports don't
    # always have dates, not even on the documents; using today.
    if directory == "Intergovernmental Agreements for Detention Space (IGAs)":
        date_string = datetime.now()
        date_string = datetime.strftime(date_string, "%B %d, %Y")
        return {"date_string": date_string, "real_title": text}
    # Need to get rid of this to process.
    if "Released Publicly" in text:
        date = text
        date = re.sub(r'\([^)]*\)', '', date)
        date = re.sub(r'\[(.*?)\]', '', date)
        date = date.replace("Released Publicly", '')
        date_chopped = date.rsplit(',')
        day = date_chopped[-1]
        date = day.strip()
        if day.isdigit():
            date_string = date_chopped[-2] + "," + date_chopped[-1]
        if "," not in date:
            date = date.strip()
            date = date.replace(" ", " 1, ")
        return {"date_string": date, "real_title": text}
    if "Revised" in text:
        date = text
        date = re.sub(r'\([^)]*\)', '', date)
        date = re.sub(r'\[(.*?)\]', '', date)
        date = date.replace("Revised", '')
        date_chopped = date.rsplit(',')
        day = date_chopped[-1]
        date = day.strip()
        if day.isdigit():
            date_string = date_chopped[-2] + "," + date_chopped[-1]
        if "," not in date:
            date = date.strip()
            date = date.replace(" ", " 1, ")
        return {"date_string": date, "real_title": text}
    if date != None:
        date = date.strip()
    # Case 1: the date is wrong because it is in the paragraph and completely written out.
    try:
        date = b.string
        date_string = date_format(date)
        title = b.string
    except:
        # These are lists of links that are different variants of the same report.
        # Case where there is a list in a paragraph tag.
        listy = b.parent.parent
        text = str(listy.previous_sibling)
        title = text
        # Case where there is a paragraph above a list.
        if len(text) < 4:
            listy = b.parent.parent
            text = listy.previous_sibling.previous_sibling
            title = str(text)[3:-4]
        date = re.sub(r'\([^)]*\)', '', title)
        date = re.sub(r'\[[^)]*\]', '', date)
        date = date.rsplit(',')
        date_string = date[-1]
        date_string = date_string.strip()
        if "," not in date_string:
            date_string = date_string.replace(" ", " 1, ")
        # For the DOJ combined page.
        if date_string == 'id="content" 1, name="content">':
            text = b.text
            text = re.sub(r'\([^)]*\)', '', text)
            chunks = text.split(",")
            day_piece = chunks[-1]
            day_chunks = day_piece.split('—')
            day = day_chunks[0]
            day = day.strip()
            day = day.replace(" ", " 1, ")
            date_string = day
            title = b.text
    ## uncomment for debugging
    # try:
    #     date = datetime.strptime(date_string, "%B %d, %Y")
    # except:
    #     print('hit one')
    #     print("b: ", b.text)
    #     print("l: ", l)
    #     print("date: ", date)
    #     print("date string", date_string)
    #     print("directory", directory)
    #     exit()
    info = {
        "real_title": title,
        "date_string": date_string,
    }
    return info