Exemple #1
0
def crear_trabajo(cron, date, hour, gramos):
    """Create and enable a cron job that runs ``archivos.py`` at a given time.

    Args:
        cron: python-crontab ``CronTab`` instance (or compatible object).
        date: day as ``"MM/DD"`` (month and day separated by a slash).
        hour: time as ``"HH:MM"``.
        gramos: value passed as the single CLI argument to archivos.py.

    Side effect: writes the updated crontab for the configured user.
    """
    # Bug fixes vs. the original:
    #  - split the date on "/" (strip("/") only removed edge slashes and the
    #    whitespace split() left the string whole),
    #  - parse the hour from `hour` (the original re-used `date`),
    #  - take the year from datetime.now() (after reassignment `date` was a
    #    list, so `date.today()` raised AttributeError),
    #  - convert the components to int before building the datetime.
    month, day = date.split("/")
    hours, minutes = hour.split(":")
    job = cron.new(command='python archivos.py ' + str(gramos))
    job.setall(datetime(datetime.now().year, int(month), int(day),
                        int(hours), int(minutes)))
    job.enable()
    cron.write_to_user(user="******")
Exemple #2
0
 def _get_case_dates(self):
     """Collect the parsed date from the first table cell of each row."""
     xpath_query = "{base}/td[1]/text()".format(base=self.base)
     dates = []
     for raw in self.html.xpath(xpath_query):
         cleaned = raw.strip()
         # Skip whitespace-only cells; convert the rest.
         if cleaned:
             dates.append(convert_date_string(cleaned))
     return dates
Exemple #3
0
def scrapeBaramDom():
    """Scrape real-estate ads from baramdom.com and return them as JSON.

    Python 2 only (``reload``/``sys.setdefaultencoding``/``unicode``).
    Relies on module-level helpers defined elsewhere: ``Downloader``,
    ``xpath``, ``Ad`` and ``adsToJson``.  Performs one HTTP request for the
    index page plus one per listing.
    """
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()  # NOTE(review): never used below
    down = Downloader('http://www.baramdom.com/')
    content = down.get_content()
    html = unicode(content)
    # Listings live inside the "box post" container; each ad is a "content" div.
    p = xpath.get(html, '//div[@class="box post"]')
    linkovi = xpath.search(p, '//div[@class="content"]')
    ads = []
    for l in linkovi:
        # Absolute link to the ad's detail page.
        link = "http://www.baramdom.com" + xpath.get(l, '//div[@class="post-title"]/h2/a/@href')
        title = xpath.get(l, '//div[@class="post-title"]/h2/a')
        imageUrl = xpath.get(l, '//a[@class="grouped"]/img/@src')
        if imageUrl == "":
            # Fall back to the site's stock "no image" placeholder.
            imageUrl = "http://www.baramdom.com/img/apartment_noimage.png"
        else:
            imageUrl = "http://www.baramdom.com" + imageUrl
        # Fetch the detail page for the description and price.
        download = Downloader(link)
        cont = download.get_content()
        cont = unicode(cont)
        description = xpath.get(cont, '//p[@class="post_add_desc"]')
        description = description.strip()
        category = u"Недвижнини"
        # "add-title" appears to read like "<verb>ам <subcategory> во <region>"
        # (e.g. "Продавам стан во Скопје") — split on " во " for the region,
        # then on "ам " for the subcategory.  TODO(review): confirm format.
        ost = xpath.get(l, '//p[@class="add-title"]')
        ost = ost.strip()
        ost = ost.split(" во ")
        region = ost[1]
        country = u"Македонија"
        k = ost[0]
        k = k.split("ам ")
        subcategory = k[1]
        # Price text: a 3-token result is treated as "no price stated" and
        # recorded as "/"; otherwise token 0 is the value, token 1 the unit.
        price = xpath.get(cont, '//div[@class="post-add"]/p[@class="last"]').strip()
        price = price.split(" ")
        if len(price)==3:
            value = "/"
            currency = "/"
        else:
            value = price[0]
            currency = price[1]
            if currency == "Euro.":
                currency = "EUR"
            elif currency == u"Ден.":
                currency = "MKD"
        # The "fl" div holds a "DD-MM-YYYY ..." date after a ">"; normalize
        # it to YYYY-MM-DD.
        date = xpath.get(l, '//div[@class="fl"]')
        date = date.strip()
        date = date.split(">")
        date = date[1]
        date = date.strip()
        date = date.split(" ")
        date = date[0]
        date = date.split("-")
        date = date[2]+"-"+date[1]+"-"+date[0]
        ad = Ad(link, title, imageUrl, description, category, subcategory, value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)
#print scrapeBaramDom()
Exemple #4
0
 def date_fetcher(self):
     """Extract up to 35 date strings from the raw page lines.

     Scans ``self.raw_url`` for table cells of class "col1", peels the text
     out of the tags, and records both the dates (``self.dates``) and the
     line positions they were found at (``self.date_index``).
     """
     self.dates = []
     self.date_index = []
     marker = '<td class="col1">'
     for line in self.raw_url:
         # Only "col1" cells can hold a date.
         if marker not in line:
             continue
         # Cheap sanity check: a real date row has a digit six characters
         # from the end of the line (just before "</td>").
         if line[-6] not in "0123456789":
             continue
         # Remove surrounding spaces, drop the 17-character opening tag,
         # then peel off the closing tag (str.strip removes any of the
         # characters "<", "/", "t", "d", ">").
         candidate = line.strip(" ")[17:].strip("</td>")
         self.dates.append(candidate)
         # .index() keeps the original first-occurrence lookup semantics.
         self.date_index.append(self.raw_url.index(line))
     # Keep at most the first 35 dates.
     self.dates = self.dates[0:35]
     return self.dates
Exemple #5
0
 def __init__(self,well_id, name, date, gor, wc,
 oil, gas, water, injection, oil_cum, gas_cum, water_cum, injection_cum,
 liquid_form, liquid_form_cum, gas_form, gas_form_cum, injection_form, injection_form_cum, work_time, work_time_cum):
     """Production record for one well.

     ``date`` is parsed from a 'YYYYMMDD' string; anything that cannot be
     parsed (wrong format, non-string) is stored as given.  All remaining
     fields are coerced to float.
     """
     self.well_id = well_id.strip()
     self.name = name.strip()
     try:
         # Bug fix: parse the *stripped* value.  The original stripped into
         # self.date but then parsed the raw, possibly padded, string, so
         # padded dates always fell through to the except branch.
         self.date = datetime.strptime(date.strip(), '%Y%m%d')
     except (AttributeError, TypeError, ValueError):
         # Not a 'YYYYMMDD' string (e.g. already a datetime): keep as given.
         # (Bare except narrowed to the failures strip/strptime can raise.)
         self.date = date
     self.gor = float(gor)
     self.wc = float(wc)
     self.oil = float(oil)
     self.gas = float(gas)
     self.water = float(water)
     self.injection = float(injection)
     self.oil_cum = float(oil_cum)
     self.gas_cum = float(gas_cum)
     self.water_cum = float(water_cum)
     self.injection_cum = float(injection_cum)
     self.liquid_form = float(liquid_form)
     self.liquid_form_cum = float(liquid_form_cum)
     self.gas_form = float(gas_form)
     self.gas_form_cum = float(gas_form_cum)
     self.injection_form = float(injection_form)
     self.injection_form_cum = float(injection_form_cum)
     self.work_time = float(work_time)
     self.work_time_cum = float(work_time_cum)
Exemple #6
0
def update_serie(sender, instance, **kwargs):
    """Signal handler: refresh the seasons and episodes of a serie from IMDb.

    Scrapes http://www.imdb.com/title/<imdb_id> for season numbers, then each
    season's episode page, creating missing Saison/Episode rows and storing
    episode release dates.

    Args:
        sender: signal sender (unused).
        instance: model instance exposing an ``imdb_id`` attribute.
    """
    serie_id = instance.imdb_id

    page = requests.get('http://www.imdb.com/title/' + serie_id)
    tree = html.fromstring(page.text)

    saison_list = tree.xpath('//*[@id="title-episode-widget"]/div/div[3]/a/text()')

    for saison in saison_list:
        if not str(saison).isdigit():
            continue

        # Bug fix: Django's .get() raises DoesNotExist instead of returning
        # None, so the original "is None" create-branch was dead code and a
        # missing season crashed the handler.  filter().first() makes the
        # get-or-create intent work.
        new_saison = Saison.objects.filter(movie=instance, saison_number=saison).first()
        if new_saison is None:
            new_saison = Saison(movie=instance, saison_number=saison)
            new_saison.save()

        page = requests.get('http://www.imdb.com/title/' + serie_id + '/episodes?season=' + saison)
        tree = html.fromstring(page.text)
        episode_list_name = tree.xpath('//*[@itemprop="episodes"]/strong[1]/a/text()')
        episode_list_date = tree.xpath('//*[@itemprop="episodes"]/div[1]/text()')

        for name, date in zip(episode_list_name, episode_list_date):
            # IMDb air dates look like "7 Jun. 2015"; unparsable dates
            # stay None.  (The original repeated this try/except twice.)
            try:
                release_date = datetime.strptime(date.strip(), '%d %b. %Y')
            except ValueError:
                release_date = None

            episodes = Episode.objects.filter(saison=new_saison, episode_name=name)
            if not episodes:
                episode = Episode(saison=new_saison, episode_name=name)
            else:
                episode = episodes[0]
            # Bug fix: store the release date for newly created episodes
            # too — the original only set it on the update branch.
            episode.release_date = release_date

            episode.save()
def InsertFoundationExchange(stockID, ForeignInvestorBuy, ForeignInvestorSell, \
							 InvestmentTrustBuy, InvestmentTrustSell, \
							 DealerBuy, DealerSell, TotalVolume, Category, date):
	"""Insert one institutional-investor (foundation) exchange record.

	All parameters arrive as strings; numeric fields are cast to int before
	insertion.  Returns 0 on success, -1 on validation/lookup/DB errors.
	Relies on module-level ``cursor``, ``db``, ``dbgPrint``, ``valid_date``
	and ``check_record``.
	"""
	# All parameters must be non-blank strings.
	if (stockID.strip()=="" or \
			ForeignInvestorBuy.strip()=="" or ForeignInvestorSell.strip()=="" or \
			InvestmentTrustBuy.strip()=="" or InvestmentTrustSell.strip()=="" or \
			DealerBuy.strip()=="" or DealerSell.strip()=="" or \
			TotalVolume.strip()=="" or Category.strip()=="" or date.strip()==""):
		dbgPrint("InsertFoundationExchange: Parameters cannot be empty")
		return(-1)

	try:
		valid_date(date)

		# Get CoId for the stock.
		cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
		row = cursor.fetchall()
		if(cursor.rowcount <= 0):
			dbgPrint("InsertFoundationExchange: Error: Cannot locate Company ID"  + str(stockID) +":"+ str(cursor.rowcount))
			return(-1)

		# Check for duplicate, i.e. same coID, same date and same category.
		if(check_record(str(row[0][0]), date, Category, "FoundationExchange") != 0):
			dbgPrint("InsertFoundationExchange (Error): Record already exist:: coID[" + str(row[0][0]) + "] date[" + str(date) + "] Category[" + str(Category) + "]")
			return(-1)

		add_fs = ("INSERT INTO FoundationExchange (CoId, ForeignInvestorBuy, ForeignInvestorSell, " \
				"InvestmentTrustBuy, InvestmentTrustSell, DealerBuy, DealerSell, TotalVolume, Category, date) " \
			"VALUES (%(_coid)s, %(_foreigninvestorbuy)s, %(_foreigninvestorsell)s, %(_investmenttrustbuy)s, %(_investmenttrustsell)s, " \
				"%(_dealerbuy)s, %(_dealersell)s, %(_totalvolume)s, %(_category)s, %(_date)s)")

		data_fs = {
			'_coid'				   : int(row[0][0]),
			'_foreigninvestorbuy'  : int(ForeignInvestorBuy),
			'_foreigninvestorsell' : int(ForeignInvestorSell),
			'_investmenttrustbuy'  : int(InvestmentTrustBuy),
			'_investmenttrustsell' : int(InvestmentTrustSell),
			'_dealerbuy'		   : int(DealerBuy),
			'_dealersell'		   : int(DealerSell),
			'_totalvolume'		   : int(TotalVolume),
			'_category'		   : int(Category),
			'_date': date,}

		cursor.execute(add_fs, data_fs)
		db.commit()

	except mcon.Error as err:
		dbgPrint("FoundationExchange: DB Error [" + str(err) + "] ")
		return(-1)
	except Exception as e:
		dbgPrint("FoundationExchange (ERROR): Exception")
		dbgPrint(e)
		# Bug fix: re-raise the original exception.  ``raise Exception``
		# threw a brand-new bare Exception and discarded the traceback.
		raise

	dbgPrint("FoundationExchange: Insert Completed: " + str(data_fs))
	return(0)
Exemple #8
0
def get_milestone_date(milestone):
    """Return the 'YYYY-MM-DD'-style date embedded in a milestone title.

    The date is read from the 11 characters following the first space (from
    the start of the string when there is no space) and must consist of
    three dash-separated integers; otherwise '' is returned.
    """
    start = milestone.find(' ') + 1  # 0 when no space is present
    candidate = milestone[start:start + 11].strip()  # date must be in format 2020-01-01
    if not candidate:
        return ''
    parts = candidate.split('-')
    try:
        # All three components must parse as integers (missing parts raise
        # IndexError, non-numeric ones ValueError — both mean "no date").
        int(parts[0])
        int(parts[1])
        int(parts[2])
    except Exception:
        return ''  # invalid date for milestone
    return candidate
Exemple #9
0
def isodate(date):
    """Parse an ISO-8601-ish string into a datetime.

    Accepts full timestamps with or without fractional seconds, plain
    dates, 'YYYY-MM' and bare 'YYYY'.  Returns None (and logs an error via
    the module-level ``log``) when no format matches.
    """
    # Idiom fix: the original nested five try/except blocks; a loop over
    # the formats, most specific first, is equivalent and flat.  The strip
    # is also hoisted instead of being repeated per attempt.
    value = date.strip()
    formats = (
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%d',
        '%Y-%m',
        '%Y',
    )
    for fmt in formats:
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            pass
    log.error('date could not be decoded: %s' % date)
    return None
    def parse_movies(self, div, schedule):
        """Build ScheduleItem entries for every movie listed under ``div``.

        Extracts the title, a joined description, best-effort metadata
        (director/actors/country/year/duration) and one item per scheduled
        "date | location" line.  Dates are localized to Europe/Lisbon.
        Python 2 syntax (``except Exception, e``).

        NOTE(review): ``items`` is built but never returned within this
        method — confirm how callers consume it.
        """
        items = []
        localtz = timezone('Europe/Lisbon');

        for movie in schedule.select('div[@class="%s"]' % div):
            title = movie.select('div[@class="infoTitleProg"]/text()').extract()[0]
            desc = ""

            try:
              # director
              infoBiblio = movie.select('div[@class="infoBiblio"]/text()').extract()
              # 0 - director
              director = infoBiblio[0].strip(' \t\n\r')
              # 1 - actors
              actors = infoBiblio[1].strip(' \t\n\r')
              # 2 - country, year - duration (e.g. "Portugal, 2012 - 90")
              reg = re.compile(r'(?P<country>.*?),'
                r' +'
                r'(?P<year>\d{4})'
                r' +- +'
                r'(?P<duration>\d{,3})')
              temp = reg.match(infoBiblio[2])
              country = temp.group('country')
              year = int(temp.group('year'))
              duration = int(temp.group('duration'))

            except Exception, e:
              # Metadata block is optional; the NameError handler below
              # skips these fields when they were never bound.
              pass

            for info in movie.select('div[@class="infoText"]/p/text()').extract():
                desc += info.strip(' \t\n\r')
                desc += '\n'

            # One schedule entry per "DD-MM-YYYY, HHhMM | location" line.
            dates_locations = movie.select('div[@class="infoDate"]/text()').re('\d{,2}-\d{,2}-\d{4}, \d{,2}h\d{,2} \| .*')
            for date_location in dates_locations:
                item = ScheduleItem()

                # dates are stored in Lisbon utc
                date, location = date_location.split("|")
                date_obj = datetime.strptime(date.strip(' \t\n\r'), '%d-%m-%Y, %Hh%M')
                date_obj_aware = localtz.localize(date_obj);

                item['date'] = date_obj_aware
                item['title'] = title.strip(' \t\n\r')
                item['location'] = location.strip(' \t\n\r')

                try:
                  # Present only when the metadata block above parsed.
                  item['director'] = director
                  item['actors'] = actors
                  item['country'] = country
                  item['year'] = year
                  item['duration'] = duration
                except NameError:
                  pass

                item['desc'] = desc.strip(' \t\n\r')
                items.append(item)
Exemple #11
0
    def messageList(id):
      """Populate ``messages[id]`` from the subject link at index ``id``.

      Closure over ``subjects``, ``messages``, ``fromQueryString``, ``all``
      and ``self`` from the enclosing scope.  Python 2 (print statement).
      """
      subject = subjects.eq(id)
      # The cell next to the subject's header cell holds "author, date".
      author, date = subject.parents("td.printhead").eq(0).next().text().split(",", 1)
      messages[id] = { "hash" : fromQueryString(subject.attr("href"), "mopen"),
               "subject": subject.text(),
               # Drop the German "von " ("by ") prefix from the author.
               "author": author.strip().replace("von ", ""),
               "date": date.strip()}

      if not all:
        print self.asciiout.trim("["+str(id)+"]  "+messages[id]["author"]+": "+subject.text())
Exemple #12
0
    def parse_pro(self, response):
        """Parse a stock profile page into a StockInfoItem.

        Pulls the stock code from the Keywords meta tag and, when present on
        the page, the ownership form (组织形式), founding date (成立日期)
        and listing date (上市日期) from the profile table.
        """
        selector = Selector(response)

        # The code is embedded in the Keywords meta tag as "...(CODE)...".
        keyword_content = selector.xpath(
            '//meta[re:test(@name,"Keywords")]/@content').extract()
        stock_code = str(keyword_content[0]).split(')')[0].split('(')[-1]

        stock_info = StockInfoItem()
        stock_info['code'] = stock_code

        # Ownership form: text of the cells following the label.
        label = u"组织形式"
        cells = selector.xpath(
            '//div/table/tr/td[re:test(text(),"%s")]/following-sibling::*/text()'
            % (label)).extract()
        ownership = "".join(str(cell.strip()) for cell in cells)
        if ownership != "":
            stock_info['ownership'] = ownership

        # Founding date: text inside the first sibling after the label.
        label = u"成立日期"
        cells = selector.xpath(
            '//div/table/tr/td[re:test(text(),"%s")]/following-sibling::*[1]//text()'
            % (label)).extract()
        found_date = "".join(cell.strip() for cell in cells)
        if found_date != "":
            stock_info['found_date'] = found_date

        # Listing date: same pattern with the 上市日期 label.
        label = u"上市日期"
        cells = selector.xpath(
            '//div/table/tr/td[re:test(text(),"%s")]/following-sibling::*[1]//text()'
            % (label)).extract()
        market_list_date = "".join(cell.strip() for cell in cells)
        if market_list_date != "":
            stock_info['market_list_date'] = market_list_date

        yield stock_info
def InsertMonthlyRevenue(stockID, MonthlyRevenue, LastMonthlyRevenue, LastYearMonthlyRevenue,\
		 MonthlyIncreaseRevenue, LastYearMonthlyIncreaseRevenue, \
		 CumulativeRevenue, LastYearCumulativeRevenue, CompareCumulativeRevenue,date):
	"""Insert one monthly-revenue record for a stock.

	All parameters are strings; revenue counts are cast to int and the
	growth/compare figures to float.  Returns 0 on success, -1 on error
	(blank args, unknown stock, duplicate record, DB failure).  Relies on
	module-level ``cursor``, ``db``, ``dbgPrint``, ``valid_date`` and
	``check_record``.
	"""
	# All parameters must be non-blank strings.
	if (stockID.strip()=="" or \
			MonthlyRevenue.strip()=="" or LastMonthlyRevenue.strip()=="" or \
			LastYearMonthlyRevenue.strip()=="" or MonthlyIncreaseRevenue.strip()=="" or \
			LastYearMonthlyIncreaseRevenue.strip()=="" or CumulativeRevenue.strip()=="" or \
			LastYearCumulativeRevenue.strip()=="" or CompareCumulativeRevenue.strip()=="" or \
			date.strip()==""):
		dbgPrint("InsertMonthlyRevenue: Parameters cannot be empty")
		return(-1)

	try:
		valid_date(date)

		# Get CoId for the stock.
		cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
		row = cursor.fetchall()
		if(cursor.rowcount <= 0):
			dbgPrint("InsertMonthlyRevenue: Error: Cannot locate Company ID"  +str(stockID) +":"+ str(cursor.rowcount))
			return(-1)

		# Duplicate check: same company + date (no category for this table).
		if(check_record(str(row[0][0]), date, "", "MonthlyRevenue") != 0):
			dbgPrint("InsertMonthlyRevenue: Error: Record already exist, please make sure no duplicates")
			return(-1)

		print("insert db")
		add_fs = ("INSERT INTO MonthlyRevenue (CoId, MonthlyRevenue, LastMonthlyRevenue, LastYearMonthlyRevenue, MonthlyIncreaseRevenue, " \
				"LastYearMonthlyIncreaseRevenue, CumulativeRevenue, LastYearCumulativeRevenue, CompareCumulativeRevenue, date) " \
			"VALUES (%(_coid)s, %(_monthlyrevenue)s, %(_lastmonthlyrevenue)s, %(_lastyearmonthlyrevenue)s, %(_monthlyincreaserevenue)s, " \
				"%(_lastyearmonthlyincreaserevenue)s, %(_cumulativerevenue)s, %(_lastyearcumulativerevenue)s, %(_comparecumulativerevenue)s, %(_date)s)" )

		data_fs = {
			'_coid'				   : int(row[0][0]),
			'_monthlyrevenue'	   : int(MonthlyRevenue),
			'_lastmonthlyrevenue'  : int(LastMonthlyRevenue),
			'_lastyearmonthlyrevenue'  : int(LastYearMonthlyRevenue),
			'_monthlyincreaserevenue'  : float(MonthlyIncreaseRevenue),
			'_lastyearmonthlyincreaserevenue': float(LastYearMonthlyIncreaseRevenue),
			'_cumulativerevenue'		  : int(CumulativeRevenue),
			'_lastyearcumulativerevenue'  : int(LastYearCumulativeRevenue),
			'_comparecumulativerevenue'   : float(CompareCumulativeRevenue),
			'_date': date,}

		cursor.execute(add_fs, data_fs)
		db.commit()

	except mcon.Error as err:
		dbgPrint("InsertMonthlyRevenue: Connect to DB Error [" + str(err) + "] ")
		return(-1)

	dbgPrint("InsertMonthlyRevenue: Insert Completed: " + str(data_fs))
	return(0)
def InsertStockExchange(stockID, ExchangeVolume, StartPrice, HighPrice, LowPrice, EndPrice, Category,  date):
	"""Insert one daily price/volume record for a stock.

	Returns 0 on success, -1 on error (blank args, non-numeric stockID,
	unknown company, duplicate record, DB failure).  Relies on module-level
	``cursor``, ``db``, ``dbgPrint``, ``valid_date``, ``check_record`` and
	``isfloat``.
	"""
	# All string parameters must be non-blank.
	if (stockID.strip()=="" or \
			ExchangeVolume.strip()=="" or StartPrice.strip()=="" or \
			HighPrice.strip()=="" or LowPrice.strip()=="" or		\
			EndPrice.strip()=="" or date.strip()==""):
		dbgPrint("InsertStockExchange: Parameters cannot be empty")
		return(-1)

	if ((stockID.lstrip('-+').isdigit() == False) and (not isfloat(stockID))):
		# NOTE(review): message names "InsertCalStatement" — copy/paste slip.
		dbgPrint("InsertCalStatement: stockID must be a digit")
		return(-1)
	else:
		# Convert stockID from float to int then string, e.g. "23.0" -> "23".
		stockID = str(int(float(stockID)))


	try:
		valid_date(date)

		# Get CoId for the stock.
		cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
		row = cursor.fetchall()
		if(cursor.rowcount <= 0):
			dbgPrint("InsertStockExchange: Error: Cannot locate Company ID"  +str(stockID) +":"+ str(cursor.rowcount))
			return(-1)

		# Duplicate check: same company + date + category.
		if(check_record(str(row[0][0]), date, Category, "StockExchange") != 0):
			dbgPrint("InsertStockExchange: Error: Record already exist, please make sure no duplicates")
			return(-1)

		add_fs = ("INSERT INTO StockExchange " \
					  "(CoId, ExchangeVolume, StartPrice, HighPrice, LowPrice, EndPrice, Category, Date) " \
				  "VALUES (%(_coid)s, %(_exchangevolume)s, %(_startprice)s, %(_highprice)s, \
					  %(_lowprice)s, %(_endprice)s, %(_category)s, %(_date)s)")

		data_fs = {
			'_coid': int(row[0][0]),
			'_exchangevolume': int(ExchangeVolume),
			'_startprice': float(StartPrice),
			'_highprice': float(HighPrice),
			'_lowprice': float(LowPrice),
			'_endprice': float(EndPrice),
			'_category': int(Category),
			'_date': date,}

		cursor.execute(add_fs, data_fs)
		db.commit()

	except mcon.Error as err:
		dbgPrint("InsertStockExchange: Connect to DB Error [" + str(err) + "] ")
		return(-1)

	dbgPrint("InsertStockExchange: Insert Completed: " + str(data_fs))
	return(0)
Exemple #15
0
def date_trans(datestr):
    """Convert a date like 'June 29, 2019' to the project's date object.

    Uses the module-level ``monthDict`` (month key -> name) for the reverse
    month lookup and the module-level ``str2date`` helper; the intermediate
    string is built as 'YYYY-M-D'.
    """
    def month_key(mapping, value):
        # Reverse lookup: all keys whose value equals ``value``.
        # (Parameter renamed — the original shadowed the ``dict`` builtin.)
        return [k for k, v in mapping.items() if v == value]

    month, day, year = datestr.split()
    day = day.strip(',')  # drop the trailing comma after the day number
    month = month_key(monthDict, month)[0]
    return str2date('{}-{}-{}'.format(year, month, day))
Exemple #16
0
 def format_date_us_history(strDate):
     """Normalize a Chinese '…|YYYY年MM月DD日…' history date string.

     Dates that DateUtil already considers valid pass through untouched;
     otherwise the text after the first '|' gets a ':00' seconds suffix and
     the Chinese date markers are turned into dashes.
     """
     if DateUtil.isVaildDate(strDate):
         return strDate
     # Keep only the part after the first '|' and append a seconds field.
     raw = strDate.partition("|")[2] + ":00"
     normalized = str(raw)
     # Translate the Chinese year/month markers to dashes; drop the day one.
     for marker, replacement in (("年", "-"), ("月", "-"), ("日", "")):
         normalized = normalized.replace(marker, replacement)
     return normalized.strip()
Exemple #17
0
    def get_date(self, response):
        """Return the scene release date as an ISO-formatted string.

        Tries, in order: the embedded ``sceneReleaseDate`` JSON field for
        the sites in ``matches``, the configured date selector, a second
        ``sceneReleaseDate`` pass (21sextreme or empty result), and finally
        the visible "updatedDate" div.
        """
        matches = [
            'christophclarkonline', 'gapingangels', 'jakemalone',
            'joeysilvera', 'lewood', 'nachovidalhardcore', 'povblowjobs',
            'tittycreampies'
        ]
        if any(x in response.url for x in matches):
            date = response.xpath(
                '//script[contains(text(),"sceneReleaseDate")]').get()
            date = re.search('sceneReleaseDate\":\"(\\d{4}-\\d{2}-\\d{2})',
                             date).group(1)
        else:
            date = self.process_xpath(response,
                                      self.get_selector_map('date')).getall()
            if len(date) > 1:
                for daterow in date:
                    # Bug fix: keep the cleaned row — the original called
                    # .replace()/.strip() and discarded the result.
                    daterow = daterow.replace('Released:', '').replace(
                        'Added:', '').strip()
                    datetemp = ""
                    if re.match('(\\d{4}-\\d{2}-\\d{2})', daterow):
                        datetemp = re.search('(\\d{4}-\\d{2}-\\d{2})',
                                             daterow).group(1).strip()
                    elif re.match('(\\d{2}-\\d{2}-\\d{4})', daterow):
                        datetemp = re.search('(\\d{2}-\\d{2}-\\d{4})',
                                             daterow).group(1).strip()
                    # Bug fix: the original assigned on ``not datetemp``,
                    # wiping the date instead of recording a found one.
                    if datetemp:
                        date = datetemp.strip()
                        break

        matches = ['21sextreme']
        if not date or any(x in response.url for x in matches):
            date = response.xpath(
                '//script[contains(text(),"sceneReleaseDate")]').getall()
            if len(date) > 1:
                for daterow in date:
                    datetemp = re.search(
                        'sceneReleaseDate\":\"(\\d{4}-\\d{2}-\\d{2})', daterow)
                    if datetemp and datetemp.group(1):
                        date = datetemp.group(1).strip()
                        break

        # Fallback: the visible "updated" date.  (The original repeated this
        # identical block twice; once is enough.)
        if not date:
            date = response.xpath(
                '//div[@class="updatedDate"]/b/following-sibling::text()').get(
                )

        return self.parse_date(date.strip(),
                               date_formats=['%m-%d-%Y',
                                             '%Y-%m-%d']).isoformat()
    def conver_time_to_epoch(date, format=None):
        """Convert a timestamp string to epoch seconds (UTC).

        Tries the explicit ``format`` first when given, then the ISO form
        'YYYY-MM-DDTHH:MM:SS'.  Returns '' when nothing parses.
        """
        date = date.strip()

        if format:
            try:
                # Bug fix: the original computed this value and dropped it.
                return calendar.timegm(
                    datetime.strptime(date, format).timetuple())
            except (TypeError, ValueError):
                pass
        try:
            return calendar.timegm(
                datetime.strptime(date, "%Y-%m-%dT%H:%M:%S").timetuple())
        except (TypeError, ValueError):
            pass
        return ''
Exemple #19
0
 def __init__(self, id, name, date, oil, gas, water, injection, work_time):
     """Daily production measurement for one well.

     Raises ValueError when the stripped ``id`` is not exactly 9 characters.
     String dates are parsed as 'DDMMYYYY'; non-string dates leave the
     ``date`` attribute unset, matching the original contract.
     """
     self.id = id.strip()
     if not len(self.id) == 9:
         raise ValueError('Некорректный id скважины')
     self.name = name.strip()
     if isinstance(date, str):
         # Bug fix: parse the *stripped* value — the original stripped into
         # self.date but then parsed the raw string, so padded input raised.
         self.date = datetime.strptime(date.strip(), '%d%m%Y')
     self.oil = float(oil)
     self.gas = float(gas)
     self.water = float(water)
     self.injection = float(injection)
     self.work_time = float(work_time)
def InsertMarginTrade(stockID, MarginBuy, MarginSell, MarginRemine, ShortSellBuy, \
		ShortSellSell, ShortSellRemine, TotalVolume, ChargeOff, Category, date):
	"""Insert one margin-trading / short-selling record for a stock.

	All parameters are strings; numeric fields are cast to int before
	insertion.  Returns 0 on success, -1 on error (blank args, unknown
	stock, duplicate record, DB failure).  Relies on module-level
	``cursor``, ``db``, ``dbgPrint``, ``valid_date`` and ``check_record``.
	"""
	# All parameters must be non-blank strings.
	if (stockID.strip()==""   or \
			MarginBuy.strip()=="" or MarginSell.strip()==""		  or \
			MarginRemine.strip()==""  or ShortSellBuy.strip()=="" or \
			ShortSellSell.strip()=="" or ShortSellRemine.strip()=="" or \
			TotalVolume.strip()==""   or ChargeOff.strip()==""	  or date.strip()==""):
		dbgPrint("InsertMarginTrade: Parameters cannot be empty")
		return(-1)

	try:
		valid_date(date)

		# Get CoId for the stock.
		cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
		row = cursor.fetchall()
		if(cursor.rowcount <= 0):
			dbgPrint("InsertMarginTrade: Error: Cannot locate Company ID" +str(stockID) +":"+ str(cursor.rowcount))
			return(-1)

		# Duplicate check: same company + date + category.
		if(check_record(str(row[0][0]), date, Category, "MarginTrading") != 0):
			dbgPrint("InsertMarginTrade: Error: Record already exist, please make sure no duplicates")
			return(-1)

		add_fs = ("INSERT INTO MarginTrading (CoId, MarginBuy, MarginSell, MarginRemine, ShortSellBuy, " \
				"ShortSellSell, ShortSellRemine, TotalVolume, ChargeOff, Category, date) " \
			"VALUES (%(_coid)s, %(_marginbuy)s, %(_marginsell)s, %(_marginremine)s, %(_shortsellbuy)s, " \
				"%(_shortsellsell)s, %(_shortsellremine)s, %(_totalvolume)s, %(_chargeoff)s, %(_category)s, %(_date)s)")

		data_fs = {
			'_coid'		  : int(row[0][0]),
			'_marginbuy'  : int(MarginBuy),
			'_marginsell' : int(MarginSell),
			'_marginremine'  : int(MarginRemine),
			'_shortsellbuy'  : int(ShortSellBuy),
			'_shortsellsell'  : int(ShortSellSell),
			'_shortsellremine' : int(ShortSellRemine),
			'_totalvolume' : int(TotalVolume),
			'_chargeoff' : int(ChargeOff),
			'_category' : int(Category),
			'_date': date,}

		cursor.execute(add_fs, data_fs)
		db.commit()

	except mcon.Error as err:
		dbgPrint("InsertMarginTrade: Insert Error [" + str(err) + "] ")
		return(-1)

	dbgPrint("InsertMarginTrade: Insert Completed: " + str(data_fs))
	return(0)
Exemple #21
0
 def __init__(self, id, name, date, bhp, buff_pressure, annular_pressure,
              line_pressure, form_pressure):
     """Pressure-survey record for one well.

     Raises ValueError when the stripped ``id`` is not exactly 9 characters.
     String dates are parsed as 'DDMMYYYY'; non-string dates leave the
     ``date`` attribute unset, matching the original contract.
     """
     self.id = id.strip()
     if not len(self.id) == 9:
         raise ValueError('Некорректный id скважины')
     self.name = name.strip()
     if isinstance(date, str):
         # Bug fix: parse the *stripped* value — the original stripped into
         # self.date but then parsed the raw string, so padded input raised.
         self.date = datetime.strptime(date.strip(), '%d%m%Y')
     self.bhp = float(bhp)
     self.buff_pressure = float(buff_pressure)
     self.annular_pressure = float(annular_pressure)
     self.line_pressure = float(line_pressure)
     self.form_pressure = float(form_pressure)
Exemple #22
0
def scraper_for_hours(date):
    """Scrape UCLA dining-hall hours for ``date``.

    Args:
        date: date path segment appended to the menu URL.

    Returns:
        dict with keys 'hourDate' (the stripped date) and 'hours' (a list
        of per-hall dicts keyed by the normalized table headers).  The list
        stays empty when the hours table or its body is missing.
    """
    url = "http://menu.dining.ucla.edu/Hours" + "/" + date
    hours = {"hourDate": date.strip(), "hours": []}

    soup = BeautifulSoup(requests.get(url).text, "lxml")
    hour_table = soup.find("table", class_="hours-table")
    # Idiom fix: ``is None`` instead of ``== None``; the two early-return
    # branches are merged.
    if hour_table is None or hour_table.find("tbody") is None:
        return hours

    # Map column position -> normalized header key: the blank first header
    # is the hall-name column; "a/b" keeps the part before the slash and
    # spaces become underscores.
    header_order_dict = {}
    counter = 0
    for tr in hour_table.find("thead"):
        for td in tr.find_all("th"):
            if td.string.strip() == "":
                header_order_dict[str(counter)] = "hall_name"
            else:
                title = td.string.strip().lower()
                if "/" in title:
                    header_order_dict[str(counter)] = title.split("/")[0]
                elif " " in title:
                    header_order_dict[str(counter)] = title.replace(" ", "_")
                else:
                    header_order_dict[str(counter)] = title
            counter += 1

    # One dict per body row (dining hall), keyed by the headers above.
    for tr in hour_table.find("tbody").find_all("tr"):
        counter = 0
        dining_hall_dict = {}
        for td in tr.find_all("td"):
            span = td.find("span")
            # Some cells wrap their text in a span, others don't.
            value = span.string if span is not None else td.string
            dining_hall_dict[header_order_dict[str(counter)]] = value.strip()
            counter += 1
        hours["hours"].append(dining_hall_dict)

    return hours
Exemple #23
0
    def conver_time_to_epoch(date, format=None):
        """Convert a timestamp string to epoch seconds (UTC).

        Tries the explicit ``format`` first when given, then the ISO form
        'YYYY-MM-DDTHH:MM:SS'.  Returns '' when nothing parses.
        """
        date = date.strip()

        if format:
            try:
                # Bug fix: the original computed this value and dropped it.
                return calendar.timegm(
                    datetime.strptime(date, format).timetuple())
            except (TypeError, ValueError):
                pass
        try:
            return calendar.timegm(
                datetime.strptime(date, "%Y-%m-%dT%H:%M:%S").timetuple())
        except (TypeError, ValueError):
            pass
        return ''
def InsertFinancialStatement(stockID, asset, equity, date):
	"""Insert one financial-statement row (total asset/equity) for a stock.

	Returns 0 on success, -1 on error (blank args, non-numeric values,
	unknown company, duplicate record, DB failure).  Relies on module-level
	``cursor``, ``db``, ``dbgPrint``, ``valid_date``, ``check_record`` and
	``isfloat``.
	"""
	# All parameters must be non-blank strings.
	if (stockID.strip()=="" or asset.strip()=="" or equity.strip()=="" or date.strip()==""):
		dbgPrint("InsertFinancialStatement: Parameters cannot be empty")
		return(-1)

	if ((stockID.lstrip('-+').isdigit() == False) and (not isfloat(stockID))):
		dbgPrint("InsertCalStatement: stockID must be a digit")
		return(-1)
	else:
		# Convert stockID from float to int then string, e.g. "23.0" -> "23".
		stockID = str(int(float(stockID)))

	# Bug fix: the original tested the bound methods themselves
	# (``asset.isdigit``), which are always truthy, so the check never
	# fired.  Call them, allowing a sign prefix like the stockID check.
	if not (asset.strip().lstrip('-+').isdigit() and equity.strip().lstrip('-+').isdigit()):
		dbgPrint("InsertFinancialStatement: stockID, asset and equity must be numbers")
		return(-1)

	try:
		valid_date(date)

		# Get CoId for the stock.
		cursor.execute("SELECT CoId FROM Company WHERE StockID=%s", (stockID,))
		row = cursor.fetchall()
		if(cursor.rowcount <= 0):
			dbgPrint("InsertFinancialStatement: Error: Cannot locate Company ID" + str(cursor.rowcount))
			return(-1)

		# Duplicate check: same company + date.
		if(check_record(str(row[0][0]), date, "", "FinancialStatement") != 0):
			dbgPrint("InsertFinancialStatement: Error: Record already exist, please make sure no duplicates")
			return(-1)

		add_fs = ("INSERT INTO FinancialStatement " \
			"(CoId, TotalAsset, TotalEquity, Date) " \
			"VALUES (%(_coid)s, %(_asset)s, %(_equity)s, %(_date)s)")

		data_fs = {
			'_coid': int(row[0][0]),
			'_asset': int(asset),
			'_equity': int(equity),
			'_date': date,}

		cursor.execute(add_fs, data_fs)
		db.commit()

	except mcon.Error as err:
		dbgPrint("InsertFinancialStatement: Connect to DB Error [" + str(err) + "] ")
		return(-1)

	dbgPrint("InsertFinancialStatement: Insert Completed: " + str(data_fs))
	return(0)
Exemple #25
0
def populate_date(date):
    """Try to find known date patterns and convert them to the default one."""
    text = date.strip()
    # (pattern, replacement) pairs tried in order; the first pattern that
    # actually rewrites the string wins.
    rules = (
        (r"^D:(\d\d\d\d)(\d\d)(.*)$", r"\1-\2"),
        (r"^(\d\d\d\d)(\d\d)(.*)$", r"\1-\2"),
        (r"^(\d+)/\d+/(\d\d+) \d\d(.*)$", r"\2-\1"),
        (r"^.* (\d\d\d\d)(?:\W+.*$|$)", r"\1-??"),
    )
    for pattern, replacement in rules:
        rewritten = re.sub(pattern, replacement, text)
        if rewritten != text:
            return rewritten.replace("-0", "-")  # remove 0 from month
    return "??"  # better to avoid javascript problems and indicate error
Exemple #26
0
def populate_date(date):
    """Normalize assorted date spellings to the default 'YYYY-M' form."""
    stripped = date.strip()
    patterns = [
        (r"^D:(\d\d\d\d)(\d\d)(.*)$", r"\1-\2"),
        (r"^(\d\d\d\d)(\d\d)(.*)$", r"\1-\2"),
        (r"^(\d+)/\d+/(\d\d+) \d\d(.*)$", r"\2-\1"),
        (r"^.* (\d\d\d\d)(?:\W+.*$|$)", r"\1-??"),
    ]
    result = None
    for pat, repl in patterns:
        candidate = re.sub(pat, repl, stripped)
        if candidate != stripped:
            result = candidate.replace("-0", "-")  # drop the month's leading zero
            break
    if result is None:
        # Nothing matched: return a sentinel instead of raw garbage
        # (avoids javascript problems downstream and flags the error).
        result = "??"
    return result
Exemple #27
0
def generate_payout_dates_cd(
        payout_file_path,
        trading_date_file,
        expected_seg="NseCD",
        year=2020):  ##Generate future payout dates till given year
    """Append future NSE-CD payout dates to the payout file.

    Reads trading dates (one "YYYY-MM-DD" per line, header skipped) from
    *trading_date_file*, keeps those after the last date already present in
    *payout_file_path*, and appends one payout date per new month — the
    third-from-last trading day of that month — up to and including *year*.
    Returns the list of date strings appended.

    NOTE(review): relies on external helpers `sanity_check`, `string_to_date`
    and `extract_last_line` defined elsewhere; `string_to_date` appears to
    return a (string, datetime-like) pair — confirm against its definition.
    """
    sanity_check(payout_file=payout_file_path,
                 trading_date_file=trading_date_file,
                 matched_with=expected_seg)
    nse_cd_dates = defaultdict(
        dict)  ## dict(year:(dict(month:[list of business days])))
    # Last payout date already recorded in the file; everything before it is skipped.
    _, last_dt = string_to_date(
        extract_last_line(payout_file_path).strip("\n"))

    with open(trading_date_file, "r") as f:
        f.readline()  ##Skipping first line
        for date in f:
            date = date.strip("\n")  ## removing extra "\n" from date
            _, dt = string_to_date(date)
            if dt > last_dt:  ## adding all those date which is greter than last date in given payout file
                _year, month, day = date.split("-")
                month_dict = nse_cd_dates[
                    _year]  ## month -> [list of trading days]
                if month not in month_dict:
                    month_dict[month] = [day]
                else:
                    month_dict[month] += [day]
                nse_cd_dates[_year].update(month_dict)

    result = []
    with open(payout_file_path, "a") as fout:
        for _year in nse_cd_dates:
            if int(_year) <= year:
                for month in nse_cd_dates[_year]:
                    if int(
                            month
                    ) > last_dt.month:  ## Add date Only if month is greater than last appended date
                        # [-3]: the third-from-last trading day of the month is
                        # used as the payout date — presumably an exchange rule;
                        # TODO confirm.
                        fout.write(_year + "-" + month + "-" +
                                   nse_cd_dates[_year][month][-3] + "\n")
                        result += [
                            _year + "-" + month + "-" +
                            nse_cd_dates[_year][month][-3]
                        ]
    logging.debug("appended dates: %s" % result)
    return result
Exemple #28
0
 def __init__(self,well_id, date, buff_pressure, annular_pressure, line_pressure):
     """Well pressure reading.

     Parses *date* (format 'dd.mm.YYYY') into a datetime, keeping the raw
     value when parsing fails, and coerces each pressure to float with a
     fallback of 0 on bad input.
     """
     self.well_id = well_id.strip()
     try:
         self.date = date.strip()
         # Parse the *stripped* value: the original parsed the raw string, so
         # whitespace-padded dates always fell into the except branch.
         self.date = datetime.strptime(self.date, '%d.%m.%Y')
     except (ValueError, TypeError, AttributeError):
         self.date = date
     try:
         self.annular_pressure = float(annular_pressure)
     except (ValueError, TypeError):
         self.annular_pressure = 0
     try:
         self.buff_pressure = float(buff_pressure)
     except (ValueError, TypeError):
         self.buff_pressure = 0
     try:
         # BUG FIX: original read float(buff_pressure) here, so line_pressure
         # silently duplicated the buffer pressure.
         self.line_pressure = float(line_pressure)
     except (ValueError, TypeError):
         self.line_pressure = 0
Exemple #29
0
    def __init__(self,well_id, date, bhp, form_pressure1, form_pressure2):
        """Bottom-hole pressure reading.

        Parses *date* (format 'dd.mm.YYYY') into a datetime, keeping the raw
        value when parsing fails, and coerces each pressure to float with a
        fallback of 0 on bad input.
        """
        self.well_id = well_id.strip()
        try:
            self.date = date.strip()
            # Parse the *stripped* value: the original parsed the raw string,
            # so whitespace-padded dates silently stayed strings.
            self.date = datetime.strptime(self.date, '%d.%m.%Y')
        except (ValueError, TypeError, AttributeError):
            self.date = date
        try:
            self.bhp = float(bhp)
        except (ValueError, TypeError):
            self.bhp = 0
        try:
            self.form_pressure1 = float(form_pressure1)
        except (ValueError, TypeError):
            self.form_pressure1 = 0
        try:
            self.form_pressure2 = float(form_pressure2)
        except (ValueError, TypeError):
            self.form_pressure2 = 0
def check_record(sID, date, category, table):
	"""Return -1 if a matching row already exists in *table* (or on DB error), else 0.

	Lookup key depends on which arguments are blank: StockID only, CoId+Date,
	or CoId+Date+Category. Uses the module-level `cursor` / `dbgPrint` / `mcon`.
	NOTE(review): *table* is concatenated into the SQL string — safe only if it
	is never user-supplied. The "cehck_record" spelling in the log messages is
	a typo kept as-is (callers may grep for it).
	"""
	try:
		if(date.strip() == ""):
			cursor.execute( \
			"SELECT COUNT(1) FROM " + table + " WHERE StockID = %s limit 1", (sID,))
		else:
			if(category.strip() == ""):
				cursor.execute( \
				"SELECT COUNT(1) FROM " + table + " WHERE CoId = %s and Date = %s limit 1", (sID, date))
			else:
				 cursor.execute( \
                 "SELECT COUNT(1) FROM " + table + " WHERE CoId = %s and Date = %s and Category = %s limit 1", (sID, date, category))

		# COUNT(1) > 0 means a duplicate exists.
		res = cursor.fetchone()
		if res[0] > 0:
			dbgPrint("cehck_record: Error: Record already exist")
			return(-1)

	except mcon.Error as err:
		 dbgPrint("cehck_record: DB Error, table[" + table + "] err [" + str(err) + "] ")
		 return(-1)

	dbgPrint("check_record Completed: table [" + table + "] [" + str(sID) + "]")
	return(0)
def build_input_data(date, hour_hh, muni_indices):
    """Download NOAA grib files for *date*/*hour_hh*, extract per-municipality
    data, and return the JSON output string from `write_json_data`.

    *muni_indices* is a mapping whose keys identify municipalities (keys are
    passed to `GroupedArray`); its value schema is consumed by `parse_grib` —
    confirm against that helper. Raises if any downloaded grib file fails to
    parse, after emailing an error report.
    """
    # Check NOAA if data is complete.
    data_finished(hour_hh.strip(), date.strip())
    url_list = create_grib_url_list(date, hour_hh)
    file_names = download_all_grib(url_list)

    # Initialize GroupedArray named muni_data_bank.
    muni_data_bank = GroupedArray(muni_indices.keys())

    # For each downloaded file, fill muni_data_bank with extracted data.
    for each_file in tqdm(file_names,
                          total=len(file_names),
                          desc="Parsing grib files"):
        if not parse_grib(each_file, muni_indices, muni_data_bank):
            send_error_email(ERROR_NAM_MESSAGE_1 %
                             get_path_dir('input_data', 'grib_test.grib2'))
            # data_finished() succeeded, so a parse failure here indicates
            # corrupt/unexpected data rather than missing data.
            raise Exception(
                'grib_grab shouldn\'t fail if data_finished method succeeds. Check data for %s'
                % "00")

    output_str = write_json_data(muni_data_bank, hour_hh)

    return output_str
Exemple #32
0
def iso8601date(date, format=None):
    """Convert a date to ISO8601 date format

input format: YYYY-MM-DD HH:MM:SS GMT (works less reliably for other TZs)
or            YYYY-MM-DD HH:MM:SS.0
or            YYYY-MM-DD
or            epoch (13 digit, indicating ms)
or            epoch (10 digit, indicating sec)
output format: iso8601; '' when no pattern matches

`format`, when given, is tried first as an explicit strptime pattern.
(The parameter name shadows the builtin but is kept for caller compatibility.)
"""
    date = date.strip()

    # Candidate strptime patterns, in the same priority order as the original
    # try/except ladder.
    patterns = []
    if format:
        patterns.append(format)
    patterns += [
        "%Y-%m-%d %H:%M:%S %Z",
        "%A, %b %d, %Y",
        "%Y-%m-%d %H:%M:%S.0",
        "%Y-%m-%d",
        "%b %d, %Y",
        "%B %d, %Y",
        "%B %d, %Y %I:%M %p",
    ]
    for pattern in patterns:
        try:
            return datetime.strptime(date, pattern).isoformat()
        except Exception:
            continue

    # Numeric epoch fallbacks: 13 digits = milliseconds, 10 digits = seconds.
    try:
        epoch = int(date)
    except Exception:
        return ''
    if 1000000000000 < epoch < 9999999999999:
        # 13 digit epoch
        return datetime.fromtimestamp(mktime(gmtime(epoch / 1000))).isoformat()
    if 1000000000 < epoch < 9999999999:
        # 10 digit epoch
        return datetime.fromtimestamp(mktime(gmtime(epoch))).isoformat()
    # If all else fails, return the empty string (the old trailing comment
    # claimed "return input", which the code never did).
    return ''
 def _set_date(self, date):
     """Store *date* on ``db_date``: datetimes as-is; non-blank strings parsed as '%d %b %Y %H:%M:%S'."""
     if isinstance(date, datetime):
         self.db_date = date
         return
     if isinstance(date, basestring) and date.strip() != '':
         parsed_fields = strptime(date, '%d %b %Y %H:%M:%S')[0:6]
         self.db_date = datetime(*parsed_fields)
Exemple #34
0
 def _get_case_dates(self):
     """Collect the first-column date cells and convert each non-blank entry."""
     xpath_expr = "{base}/td[1]/text()".format(base=self.base)
     converted = []
     for raw in self.html.xpath(xpath_expr):
         cleaned = raw.strip()
         if cleaned:
             converted.append(convert_date_string(cleaned))
     return converted
def scrape_links(links):
     """Scrape each listing URL in *links* into a scraperwiki sqlite record.

     Python 2 code. Stops when it reaches the module-level `last_link`
     (already-scraped marker). For every newer link it extracts title, post
     date, body text, de-obfuscated phone numbers, listed age, location
     (matched against `NEIGHBORHOODS`), picture URLs and a base64 of the last
     large picture, then saves keyed on "Title".
     NOTE(review): depends on module globals `last_link`, `subDic`, `wordDic`,
     `NEIGHBORHOODS`, `replace_all` — confirm against the rest of the file.
     """
     maincleaner = Cleaner(allow_tags=['div'], remove_unknown_tags=False, remove_tags=['div'])     # funtion to remove every tag

#    while True:
     for link in links:            # Loop through all the links
        if link == last_link:      # Check if this link has already been scraped (this will eventually be changed to check dates)
            break                  # If we've hit something we've already scraped, break out of the loop
#        try:
        linkhtml = scraperwiki.scrape(link).decode('latin_1')          # scrape the contents of the current link and decode from Windows-1252 encoding
        print link
        root = lxml.html.fromstring(linkhtml)                               # turn scraped content into an HTML object

        # GET TITLE
        title = root.cssselect("h1")[0].text.encode('utf-8')                # grab the page header (title) and return its text as unicode
        title = replace_all(title, subDic)                                  # replace alphanumeric obfuscations with letters

        # GET DATE
        date = root.cssselect("div.adInfo")[0].text                         # get the text of the html entity that contains the date and time of the post
        cleandate = re.sub(r'(\S+\s+\d+,\s+\d\d\d\d)(?:,?) (\d+\:\d+ \w\w)', r'\1 \2', date.strip())  # get date into a standard format
        cleandate = re.search(r'\S+\s+\d+, \d\d\d\d \d+\:\d+ \w\w', cleandate).group(0) # find the date string on the page
        rawdate = datetime.strptime(cleandate,'%B %d, %Y %I:%M %p')                 # encode the date as a date using format Month dd, YYYY
        date = rawdate.strftime('%Y-%m-%d %H:%M')                        # decode that date back into a string of format YYYY-mm-dd

        # GET MAIN BODY TEXT
        mainwithtags = root.cssselect("div.postingBody")[0]                # grabs the body text of the post
        main = maincleaner.clean_html(mainwithtags).text.encode('utf-8')            # gets rid of all HTML tags
        main = replace_all(main, subDic)                                            # replace alphanumeric obfuscations with letters

        # GET PHONE NUMBER(S)
        stripped = replace_all(main.lower(), wordDic)                               # replaces common phone number obfuscations with actual numbers
        phonecomp = re.compile("[\s\-/=\.,{}_\!\@\#\$\%\^\&\*\(\)\~]")      # list of known phone number dividers
        stripped = phonecomp.sub('',stripped)                               # remove phone number dividers
        phone = re.findall(r'(?:1?)[1-9]\d{9}',stripped)                    # search for groups of 10 consecutive numbers (with an optional preceding 1)
        phone = list(set(phone))                                            # gets rid of duplicate numbers by turning list into a set and back
        phone = ", ".join(phone)                                            # formats phone numbers as "phone1, phone2,... phoneN"
        
        # GET LISTED AGE
        if root.cssselect("p.metaInfoDisplay"):                             # does the entry have metainfo?
            listedage = root.cssselect("p.metaInfoDisplay")[0]              # get the the first html metainfo element
            listedage = re.sub("[^\d]","",listedage.text)                   # get rid of all non-numeric text in the text of the element
        else:                                                               # if there's no metainfo
            listedage = ""                                                  # set the listed age to an empty string

        # GET LOCATION
        if re.findall(r'Location\:(.*?)\</div\>',linkhtml, flags=re.DOTALL):  # 
            location = re.findall('Location\:(.*?)\</div\>',linkhtml, flags=re.DOTALL)[0].encode('utf-8')
#            location = removeNonAscii(location)
            #if any(x in NEIGHBORHOODS) in location:
             #   print x, 'x'
              #  area =  x
            area = None
            for neighborhood in NEIGHBORHOODS:
                if neighborhood in location.lower():
                    area = neighborhood

            print repr(area)
            print repr(location)
        else:
            location = ""

        # Collect large-size picture URLs; the last one is also fetched and
        # base64-encoded into piccode.
        picturelist=[]
        pictures = root.cssselect('ul#viewAdPhotoLayout img')
        for i in range(len(pictures)):
            largepic = re.sub('/medium/','/large/',pictures[i].get('src'))
            picturelist.append(largepic)
        print picturelist 
        picturelist = " ".join(picturelist)
        x = urllib.urlopen(largepic).read()
        piccode = base64.encodestring(x)
        print piccode
        
#        except:
#            print 'FAILED TO LOAD: ' + link
#        continue
#            record = {}
#            record['Title'] = 'LOAD FAILURE'
        # Set up our data record - we'll need it later

        record = {}
        record['Title'] = title #.encode('ascii', 'ignore').strip()
        record['Date'] = date
        record['Main'] = main #.encode('ascii', 'ignore').strip()
        record['Pictures'] = picturelist
        record['Phone'] = phone
        record['Listed Age'] = listedage #.encode('ascii', 'ignore').strip()
        record['Location'] = location
        record['area']= area
        record['PicCode'] = piccode #.encode('ascii', 'ignore').strip()
            # Print out the data we've gathered
           #print record, '------------'
            # Finally, save the record to the datastore - 'Artist' is our unique key
        scraperwiki.sqlite.save(["Title"], record)
        time.sleep(2)
    def handle(self, *args, **options):
        """Scrape dsca.mil monthly arms-sale archive pages into Proposed rows.

        Python 2 Django management command. Builds archive URLs from the
        current month back to last month (or December of the previous year
        when run in January), skips listings already stored (looked up by
        dsca_url), saves each new Proposed record, mirrors its PDF to default
        storage under arms_pdf/<id>.pdf, and indexes the text into the
        "foreign" Elasticsearch index.
        """
        base_url = "http://www.dsca.mil/"
        results = []

        # create list of links to search
        links2archives = []
        year = int(datetime.strftime(datetime.today().date(), "%Y"))
        month = int(datetime.strftime(datetime.today().date(), "%m"))

        if month == 1:
            month_limit = 12
            year_limit = year - 1
        else:
            month_limit = month - 1
            year_limit = year

        # set to 2008 for full records
        while year >= year_limit:
            while month >= month_limit:
                # May 2008 is the oldest archive page; stop there.
                if year == 2008 and month == 05:
                    break
                if len(str(month)) < 2:
                    month_format = "0" + str(month)
                else:
                    month_format = str(month)
                link = "http://www.dsca.mil/major-arms-sales/archives/" + str(
                    year) + month_format
                links2archives.append(link)
                month = month - 1
            month = 12
            year = year - 1

        # find titles and links to pages
        for link in links2archives:
            print "working on ", link
            archive_page = soupify(link)
            archive_body = archive_page.select(".view-content")
            # there are not entries for every month
            try:
                archive_body = archive_body[1]
            except IndexError:
                continue
            info = archive_body.select(".mas-regions")

            # find info for each
            for profile in info:
                links2pages = profile.find_all("a")
                pagelink = links2pages[0].get("href")
                pagelink = base_url + pagelink

                try:
                    existing_record = Proposed.objects.get(dsca_url=pagelink)
                    print "exists"
                except:
                    title = links2pages[0].text
                    date_p = profile.find_all("div")[-1]
                    date_p = date_p.text

                    if "Defense Security Cooperation Agency\n" in date_p:
                        date_p = date_p.replace(
                            "Defense Security Cooperation Agency\n", "")

                    # Dateline variants: try en-dash, then "-", then "--";
                    # a plausible date is at most 25 characters long.
                    date = date_p.split(u"–")
                    date = date[0]
                    date = date.replace("WASHINGTON, ", "")
                    date = date.strip()

                    if len(date) > 25:
                        date = date_p.split("-")
                        date = date[0]
                        date = date.replace("WASHINGTON, ", "")
                        date = date.strip()

                    if len(date) > 25:
                        date = date_p.split("--")
                        date = date[0]
                        date = date.replace("WASHINGTON, ", "")
                        date = date.strip()

                    try:
                        date_obj = datetime.strptime(date, "%b %d, %Y")
                    except:
                        if "Sept." in date or "Sept " in date:
                            date = date.replace("Sept", "Sep")

                        try:
                            date_obj = datetime.strptime(date, "%b. %d, %Y")
                        except:
                            pass
                        try:
                            date_obj = datetime.strptime(date, "%B %d, %Y")
                        except:
                            date_obj = None

                    # looking at individual page
                    page = soupify(pagelink)

                    # a few don't have pdfs
                    try:
                        pdf_link = page.select(".file")[0].find_all("a")
                    except:
                        pdf_link = None

                    if pdf_link != None:
                        pdf_link = pdf_link[0].get("href")

                    data_text = ''
                    field_text = page.select(".field-item")
                    for d in field_text:
                        data_text = data_text + "\n" + d.text

                    record = Proposed(
                        title=title,
                        text=data_text,
                        date=date_obj,
                        dsca_url=pagelink,
                        pdf_url=pdf_link,
                    )

                    # Country is the part of the title before the (en-)dash.
                    country = title.split(u"–")
                    if len(country) <= 1:
                        country = title.split(u"-")

                    country = country[0]
                    country = country.replace("Government of ", "")
                    country = country.replace("The ", "")
                    country = country.strip()

                    # Normalize known naming variants to Location names.
                    cleaning = {
                        "Iraq F":
                        "Iraq",
                        "Republic of Korea":
                        "South Korea",
                        "Republic of Korea (ROK)":
                        "South Korea",
                        "United Arab Emirates (UAE)":
                        "United Arab Emirates",
                        "Taipei Economic and Cultural Representative Office in the United States":
                        "Taiwan",
                        "Kingdom of Morocco":
                        "Morocco"
                    }
                    if cleaning.has_key(country):
                        country = cleaning[country]

                    try:
                        matching_loc = Location.objects.get(location=country)
                        loc_id = int(matching_loc.id)
                        record.location_id = loc_id
                        record.location = matching_loc.location
                        print loc_id
                    except:
                        matching_loc = None

                    record.save()
                    print "added record %s" % (record)

                    # #save to amazon
                    try:
                        file_name = "arms_pdf/" + str(record.id) + ".pdf"
                        pdf_link = str(pdf_link)
                        u = urllib2.urlopen(pdf_link)
                        localFile = default_storage.open(file_name, 'w')
                        localFile.write(u.read())
                        localFile.close()

                    except:
                        print 'not working'
                        message = 'bad upload ' + title
                        logger.error(message)

                    results.append({
                        "title": title,
                        "date": date,
                        "link": pagelink,
                        "pdf_link": pdf_link,
                        "text": data_text
                    })

                    try:
                        doc = {
                            'title': title,
                            'text': data_text,
                            'location': record.location,
                            'location_id': record.location_id,
                            'date': record.date,
                        }
                        print "made doc"
                        res = es.index(index="foreign",
                                       doc_type='arms',
                                       id=record.id,
                                       body=doc)
                    except:
                        message = 'bad pdf no elasticsearch upload for - %s' % (
                            title)
                        logger.error(message)

                    print title
0
	def handle(self, *args, **options):
		"""Scrape dsca.mil monthly arms-sale archive pages into Proposed rows.

		Python 2 Django management command (tab-indented variant). Builds
		archive URLs back to the previous month, skips listings already stored
		(looked up by dsca_url), extracts the text from each entry's print
		page, saves a Proposed record, and mirrors its PDF to default storage
		under arms_pdf/<id>.pdf.
		"""
		base_url = "http://www.dsca.mil/"
		results= []
		
		# create list of links to search
		links2archives = []
		year = int(datetime.strftime(datetime.today().date(), "%Y"))
		month = int(datetime.strftime(datetime.today().date(), "%m"))

		if month == 1:
			month_limit = 12
			year_limit = year - 1
		else:
			month_limit = month - 1
			year_limit = year

		# set to 2008 for full records
		while year >= year_limit:
			# May 2008 is the oldest archive page; stop there.
			if year == 2008 and month == 05:
					break
			while month >= month_limit:
				if year == 2008 and month == 05:
					break
				if len(str(month)) < 2:
					month_format = "0" + str(month)
				else:
					month_format = str(month)
				link = "http://www.dsca.mil/major-arms-sales/archives/" + str(year) + month_format
				links2archives.append(link)
				month = month - 1
			month = 12
			year = year - 1

		# find titles and links to pages
		for link in links2archives:
			print "working on ", link
			archive_page = soupify(link)
			archive_body = archive_page.select(".view-content")
			# there are not entries for every month
			try:
				archive_body = archive_body[1]
			except IndexError:
				continue		
			info = archive_body.select(".mas-regions")

			# find info for each
			for profile in info:
				links2pages = profile.find_all("a")
				pagelink = links2pages[0].get("href")
				pagelink = base_url + pagelink
				
				try: 
					existing_record = Proposed.objects.get(dsca_url=pagelink)
					
				except:
					title = links2pages[0].text
					date_p = profile.find_all("div")[-1]
					date_p = date_p.text
					
					if "Defense Security Cooperation Agency\n" in date_p:
						date_p = date_p.replace("Defense Security Cooperation Agency\n", "")

					# Dateline variants: try en-dash, then "-", then "--";
					# a plausible date is at most 25 characters long.
					date = date_p.split(u"–")
					date = date[0]
					date = date.replace("WASHINGTON, ", "")
					date = date.strip()

					if len(date) > 25:
						date = date_p.split("-")
						date = date[0]
						date = date.replace("WASHINGTON, ", "")
						date = date.strip()

					if len(date) > 25:
						date = date_p.split("--")
						date = date[0]
						date = date.replace("WASHINGTON, ", "")
						date = date.strip()

					try:
						date_obj = datetime.strptime(date, "%b %d, %Y")
					except:
						if "Sept." in date or "Sept " in date:
							date = date.replace("Sept", "Sep")

						try:	
							date_obj = datetime.strptime(date, "%b. %d, %Y")
						except:
							pass
						try:
							date_obj = datetime.strptime(date, "%B %d, %Y")
						except: 
							date_obj = None
					
					# looking at individual page
					page = soupify(pagelink)
					print_link = page.select(".print_html")[0].find_all("a")
					print_link = print_link[0].get("href")
					
					# a few don't have pdfs
					try:
						pdf_link = page.select(".file")[0].find_all("a")
					except:
						pdf_link = None

					if pdf_link != None:
						pdf_link = pdf_link[0].get("href")
					
					print_page = soupify(print_link)
					data_text = print_page.select(".print-content")[0] 
					data_text = data_text.text
					record = Proposed(
						    title = title,
						    text = data_text,
						    date = date_obj,
						    dsca_url = pagelink,
						    pdf_url = pdf_link,
						    print_url = print_link,
						)
					
					# Country is the part of the title before the (en-)dash.
					country = title.split(u"–")
					if len(country) <= 1:
						country = title.split(u"-")
					
					country = country[0]
					country = country.replace("Government of ", "")
					country = country.replace("The ", "")
					country = country.strip()

					# Normalize known naming variants to Location names.
					cleaning = {"Iraq F":"Iraq", "Republic of Korea":"South Korea", "Republic of Korea (ROK)":"South Korea", "United Arab Emirates (UAE)":"United Arab Emirates", "Taipei Economic and Cultural Representative Office in the United States":"Taiwan", "Kingdom of Morocco":"Morocco"}
					if cleaning.has_key(country):
						country = cleaning[country]

					try:
						matching_loc = Location.objects.get(location=country)
						loc_id = int(matching_loc.id)
						record.location_id = loc_id
					except:
						matching_loc = None
					
					record.save()

					#save to amazon
					try:
						file_name = "arms_pdf/" + str(record.id) + ".pdf"
						pdf_link = str(pdf_link)
						u = urllib2.urlopen(pdf_link)
						localFile = default_storage.open(file_name, 'w')
						localFile.write(u.read())
						localFile.close() 

					except:
						print 'not working'
						message = 'bad upload ' + title
						logger.error(message)
							
					results.append({"title":title, "date":date, "link": pagelink, "pdf_link":pdf_link, "print_link":print_link, "text": data_text})
					print title	
Exemple #38
0
# Top-level scrape loop: walk Paylocity job listings from an already-built
# `soup`, deriving title, link, post date, and location for each entry.
# NOTE(review): this snippet appears truncated by the dump — the loop body
# ends mid-extraction after the zip-code search.
job_listings = soup.find_all('div', {'class': 'job-listing-job-item'})

for job_listing in job_listings:
    job_description = job_listing.find_all('span')
    # Get job title and link
    job_title = job_description[0].a.text
    info_link = 'https://recruiting.paylocity.com' + job_description[0].a[
        'href']
    job_summary = info_link
    # Get date as string
    date = job_description[1].text
    # Clean up date string by removing trailing -'s, then split and convert to datetime object
    if date[len(date) - 2] == '-':
        date = date[0:len(date) - 3]
    # Date is M/D/Y; split into numeric parts.
    date = date.strip().split('/')
    month = int(date[0])
    day = int(date[1])
    year = int(date[2])
    job_post_date = datetime(year, month, day)
    # Get Location
    job_location = job_listing.find('div', {
        'class': 'location-column'
    }).span.text
    # Get soup of job listing to scrape more info
    listing_soup = get_soup(info_link)
    listing_body = listing_soup.find('body').find_all('p')
    # Retrieve Full/Part-time and Salary info if available
    if 'Location' in listing_body[0].text:
        location_string = listing_body[0].text.split(':')[1].lstrip()
        zip_code_result = re.search(r'(\d{5})', location_string)
Exemple #39
0
 def parse_date(date, lang):
     """Trim *date* and parse it with dateparser restricted to language *lang*."""
     cleaned = date.strip()
     return dateparser.parse(cleaned, languages=[lang])
Exemple #40
0
def parse_pub_dates(request):
    """Split the POSTed ``pub_date`` field on newlines; return trimmed, non-empty entries."""
    entries = request.POST["pub_date"].split("\n")
    stripped = (entry.strip() for entry in entries)
    return [entry for entry in stripped if entry != ""]
Exemple #41
0
def render():
    """Build slice/segment data for the world-cup visualization and render it.

    Assigns a global sliceId to every team, calendar date, venue and stage
    (with one skipped id between groups as a visual gap), fills the
    module-level lookup dicts, computes per-slice click mappings, and renders
    the world_cup2.html template with everything attached.
    NOTE(review): mutates many module globals (dic_sliceId, dic_name2sliceId,
    dic_sliceId2name, dic_slice_2_games, teams_name_dic, space) — not
    re-entrant.
    """
    global stages, teams, places, rounds, space, games, places, dic_slice_2_games, dates, tournaments, tournamentPos, goals
    sliceId = 0
    # Fixed share (out of 1000) of the ring allotted to each group.
    shares = {
        "teams": 350,
        "calendar": 600 // 3,
        "places": 650 // 3,
        "stages": 400 - 650 // 3
    }
    space = (1000 - (shares["teams"] + shares["calendar"] + shares["places"] +
                     shares["stages"])) // 4
    calendar = []
    # list of teams
    for t in teams:
        t["value"] = shares["teams"] / len(teams)
        t["color"] = "#4daa4b"
        t["sliceId"] = sliceId
        t["id_group"] = 0
        teams_name_dic[str(t["id"])] = t["name"]
        dic_sliceId[sliceId] = 0
        dic_name2sliceId[str(t["id"])] = sliceId
        dic_sliceId2name[sliceId] = t["id"]
        sliceId += 1

    sliceId += 1  # skip one slice id for the gap between groups
    # calendar of games
    for date in dates:
        c = {}
        strTime = getNormalDate(date.strip())
        Time = datetime.strptime(strTime.strip(), '%Y-%m-%d')
        c["value"] = shares["calendar"] / len(dates)
        # Day-of-month + month + abbreviated weekday (locale-dependent names).
        c["name"] = Time.strftime(" %d %B") + " " + Time.strftime(
            "%A")[0:3] + "."
        c["color"] = "#ddea4f"
        c["sliceId"] = sliceId
        c["id_group"] = 1
        dic_sliceId[sliceId] = 1
        dic_name2sliceId[strTime] = sliceId
        dic_sliceId2name[sliceId] = strTime
        sliceId += 1
        calendar.append(c)

    sliceId += 1  # skip one slice id for the gap between groups
    # stadium + city
    for p in places:
        p["name"] = p["stadium"].split("|")[0] + ";" + p["city"]
        p["value"] = shares["places"] / len(places)
        p["color"] = "#4a69a9"
        p["sliceId"] = sliceId
        p["id_group"] = 2
        dic_sliceId[sliceId] = 2
        dic_name2sliceId[p["id"]] = sliceId
        dic_sliceId2name[sliceId] = p["id"]
        sliceId += 1
    sliceId += 1  # skip one slice id for the gap between groups
    # tournament stages/rounds
    for s in stages:
        s["value"] = shares["stages"] / len(stages)
        s["color"] = "#a89449"
        s["sliceId"] = sliceId
        s["id_group"] = 3
        dic_sliceId[sliceId] = 3
        dic_name2sliceId["s" + str(s["id"])] = sliceId
        dic_sliceId2name[sliceId] = "s" + str(s["id"])
        sliceId += 1

    # which games to show when a slice is clicked
    for i in range(sliceId):
        dic_slice_2_games[i] = []

    click_events = []
    for curSlice in range(sliceId):
        click_events.append({
            "key": curSlice,
            "value": getConnectionBySliceId(curSlice)
        })

    slice_name = []
    for d in dic_slice_2_games:
        slice_name.append({"key": d, "value": dic_slice_2_games[d]})

    dic_slice_2_games = {}
    return render_template("world_cup2.html",
                           teams=teams,
                           rounds=calendar,
                           places=places,
                           stages=stages,
                           space=space,
                           outGroups=outGroups,
                           click_events=click_events,
                           games_clear=games_clear,
                           slice_name=slice_name,
                           games_playoff=games_playoff,
                           tournaments=tournaments,
                           tournamentPos=tournamentPos,
                           goals=goals)
    def iso8601date(date, date_format=None):
        """Convert a date to ISO8601 date format
        input format: YYYY-MM-DD HH:MM:SS GMT (works less reliably for other TZs)
        or            YYYY-MM-DD HH:MM:SS.0
        or            YYYY-MM-DD
        or            epoch (13 digit, indicating ms)
        or            epoch (10 digit, indicating sec)
        output format: iso8601 ('' when nothing matches)"""
        date = date.strip()

        # Candidate strptime patterns, in the same priority order as the
        # original try/except ladder; an explicit date_format gets first shot.
        patterns = []
        if date_format:
            patterns.append(date_format)
        patterns += [
            "%Y-%m-%d %H:%M:%S",
            "%A, %B %d, %Y %I:%M %p",  # Friday, October 2, 2015 1:35 AM
            "%A, %d %B %Y, %H:%M",     # Friday, 2 October 2015, 18:23
            "%a %B %dst, %Y",          # Thu October 01st, 2015
            "%a %B %dnd, %Y",          # Thu October 02nd, 2015
            "%a %B %drd, %Y",          # Thu October 03rd, 2015
            "%a %B %dth, %Y",          # Thu October 04th, 2015
            "%Y-%m-%d %H:%M:%S %Z",
            "%A, %b %d, %Y",
            "%Y-%m-%d %H:%M:%S.0",
            "%Y-%m-%d",
            "%b %d, %Y",
            "%B %d, %Y",
            "%B %d, %Y %I:%M %p",
            "%b %d, %Y at %I:%M %p",
            "%m-%d-%Y",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%SZ",
        ]
        for pattern in patterns:
            try:
                return datetime.strptime(date, pattern).isoformat()
            except Exception:
                continue

        # Numeric epoch fallbacks: 13 digits = milliseconds, 10 = seconds.
        try:
            epoch = int(date)
        except Exception:
            return ''
        if 1000000000000 < epoch < 9999999999999:
            # 13 digit epoch
            return datetime.fromtimestamp(mktime(gmtime(epoch / 1000))).isoformat()
        if 1000000000 < epoch < 9999999999:
            # 10 digit epoch
            return datetime.fromtimestamp(mktime(gmtime(epoch))).isoformat()
        # If all else fails, return empty
        return ''
Exemple #43
0
    def _set_date(self, date):
        """Set ``db_date`` from a datetime or a '%d %b %Y %H:%M:%S' string.

        Blank strings and other types are silently ignored.
        """
        if isinstance(date, datetime):
            self.db_date = date
        elif isinstance(date, str) and date.strip() != '':
            # BUG FIX: the original's last line was indented with a tab while
            # the rest used spaces (a TabError under Python 3). Also replaced
            # `type(x) == T` comparisons with isinstance().
            # time.strptime yields a struct_time; its first six fields feed datetime().
            self.db_date = datetime(*strptime(date, '%d %b %Y %H:%M:%S')[0:6])
Exemple #44
0
def odd_link(b, date, l, directory):
  """Extract a date string and title from an irregular report-listing entry.

  Handles the special cases where the usual date/title layout is broken:
  non-document links, known entries whose date field actually holds a title,
  confirmed undated reports, dates embedded in surrounding paragraph text,
  and lists of variant links sharing one title.

  Args:
    b: element holding the entry text (presumably a BeautifulSoup tag
       — it is used via .get_text()/.string/.parent; verify against caller).
    date: raw scraped date string for this entry, or None.
    l: the entry's anchor element; may not support .get("href").
    directory: name of the report section being scraped.

  Returns:
    dict with "date_string" and "real_title" keys; both are False when the
    link is judged not to be a document.
  """
  text = b.get_text()
  # not links to docs
  # Bare except: if `l` has no .get (or is None), `link` is simply left
  # unbound and the locals() guard below skips the link checks.
  try:
    link = l.get("href")
  except:
    pass

  # these are not documents
  # NOTE(review): `"link" in locals()` is a fragile way to test whether the
  # try above succeeded; an explicit `link = None` sentinel would be clearer.
  if "link" in locals():
    if link[-4:] == ".gov":
      return {"date_string":False, "real_title":False}
    elif link[-5:] == ".gov/" or link == "/usao/eousa/index.html":
      return {"date_string":False, "real_title":False}
  text = b.get_text()

  #section for documents without dates:
  # Hard-coded fix-ups: for these known entries the scraped "date" field
  # actually contains the report title, so a date is supplied manually.
  if date != None:
    if date.strip() == "Alleged Deception of Congress: The Congressional Task Force on Immigration Reform's Fact-Finding Visit to the Miami District of INS in June 1995":
      return {"date_string": "June 1, 1996", "real_title": "Alleged Deception of Congress: The Congressional Task Force on Immigration Reform's Fact-Finding Visit to the Miami District of INS in June 1995"}
    if date == "Audit Report GR-30-00-001":
      return {"date_string": "November 1, 2000", "real_title":"McMechen, West Virginia Police Department, Audit Report GR-30-00-001"}
    # no date, one other entry, giving it the same date
    if date == "Georgia's Department of Corrections":
      return {"date_string": "November 1, 2000", "real_title":"United States Marshals Service Cost Proposal for the Intergovernmental Service Agreement for Detention Facilities with the City of Atlanta, Georgia’s Department of Corrections"}
    # confirmed no dates for these
    no_dates = ("Audit Report GR-40-99-014", "Audit Report GR-40-99-011", "Evaluation and Inspections Report I-2000-021", "Evaluation and Inspections Report I-2000-018", "Audit Report 99-03")
    if date.strip() in no_dates:
      # Undated reports get today's date as a placeholder.
      date_string = datetime.now()
      date_string = datetime.strftime(date_string, "%B %d, %Y")
      return {"date_string": date_string, "real_title": text}
    # Intergovernmental Agreements for Detention Space External Reports don't always have dates, not even on the documents, using today
    if directory == "Intergovernmental Agreements for Detention Space (IGAs)":
      date_string = datetime.now()
      date_string = datetime.strftime(date_string, "%B %d, %Y")
      return {"date_string": date_string, "real_title": text}

  # need to get rid of this to process
  # "Released Publicly ..." entries: strip parenthetical/bracketed notes and
  # the marker text, then recover the date from the trailing ", <day>" chunk.
  if "Released Publicly" in text:
    date = text
    date = re.sub(r'\([^)]*\)', '', date)
    date = re.sub(r'\[(.*?)\]', '', date)
    date = date.replace("Released Publicly", '')
    date_chopped = date.rsplit(',')
    day = date_chopped[-1]
    date = day.strip()
    if day.isdigit():
        date_string = date_chopped[-2] + "," + date_chopped[-1]
    if "," not in date:
      date = date.strip()
      # "Month Year" -> "Month 1, Year" (default to the 1st of the month).
      date = date.replace(" ", " 1, ")
    return{"date_string": date, "real_title": text}

  # Same recovery procedure for "Revised ..." entries.
  if "Revised" in text:
    date = text
    date = re.sub(r'\([^)]*\)', '', date)
    date = re.sub(r'\[(.*?)\]', '', date)
    date = date.replace("Revised", '')
    date_chopped = date.rsplit(',')
    day = date_chopped[-1]
    date = day.strip()
    if day.isdigit():
        date_string = date_chopped[-2] + "," + date_chopped[-1]
    if "," not in date:
      date = date.strip()
      date = date.replace(" ", " 1, ")
    return{"date_string": date, "real_title": text}

  if date != None:
    # NOTE(review): missing call parentheses — this binds the bound method
    # `date.strip` instead of calling it. Harmless only because `date` is
    # reassigned immediately below, but almost certainly unintended.
    date = date.strip

    # case 1, date is wrong because it is in the paragraph and completely written out
    # The try relies on date_format() raising (e.g. when b.string is None for
    # multi-child tags) to fall through to the list-of-variants handling.
    try:
        date =  b.string
        date_string = date_format(date)
        title = b.string
    except:
      # these are lists of links that are different variants of the same report in a list
      # case where there is a list in a paragraph tag
      listy = b.parent.parent
      text = str(listy.previous_sibling)
      title = text

      # case where there is a paragraph above a list
      if len(text) < 4:
        listy = b.parent.parent
        text = listy.previous_sibling.previous_sibling
        title = str(text)[3:-4]
      # Strip (...) and [...] notes, then take the text after the last comma
      # as the date; "Month Year" becomes "Month 1, Year" as above.
      date = re.sub(r'\([^)]*\)', '', title)
      date = re.sub(r'\[[^)]*\]', '', date)
      date = date.rsplit(',')
      date_string = date[-1]
      date_string = date_string.strip()
      if "," not in date_string:
        date_string = date_string.replace(" ", " 1, ")

  # for the DOJ combined page
  # This exact string is the artifact produced when the " 1, " substitution
  # above runs over a stray HTML attribute fragment; recover by re-parsing
  # the element text around the em-dash instead.
  # NOTE(review): if none of the earlier branches assigned date_string/title
  # (e.g. date is None and no marker text matched), this line raises
  # NameError — confirm callers always hit one of the branches.
  if date_string == 'id="content" 1, name="content">':
    text = b.text
    text = re.sub(r'\([^)]*\)', '', text)
    chunks = text.split(",")
    day_piece = chunks[-1]
    day_chunks = day_piece.split('—')
    day = day_chunks[0]
    day = day.strip()
    day = day.replace(" ", " 1, ")
    date_string = day
    title = b.text

  ## uncomment for debugging
  # try:
  #   date = datetime.strptime(date_string, "%B %d, %Y")
  # except:
  #   print('hit one')
  #   print("b:  ", b.text)
  #   print("l:  ", l)
  #   print("date: ", date)
  #   print("date string", date_string)
  #   print("directory", directory)
  #   exit()

  info = {"real_title":title, "date_string": date_string, }
  return(info)