Example #1
    def keyword_traffics(self, filters, start_date, end_date):
        if not self.account:
            raise GoogleException('Please set account id.', 504)
        if isinstance(start_date, (str, unicode)):
            start_date = dateParse(start_date)
        if isinstance(end_date, (str, unicode)):
            end_date = dateParse(end_date)
        if isinstance(filters, (str, unicode)):
            filters = ast.literal_eval(filters)  # parses literals only; eval would execute arbitrary code
        data = self.account.get_data(
            start_date, end_date, metrics=['visits'], dimensions=['keyword'],
            filters=filters, sort=['-visits'], max_results=GoogleAnalytics.MAX_RESULTS)
        return data
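Every example on this page aliases dateutil's parser, typically via from dateutil.parser import parse as dateParse (Example #22 below shows the import explicitly). A minimal sketch of the call these snippets rely on; the sample strings are illustrative:

from dateutil.parser import parse as dateParse

dateParse("2017-06-13 18:00:00 +02:00")  # timezone-aware datetime
dateParse("13 March 2015")               # naive datetime
dateParse("2015-03-13T10:05:00Z")        # ISO 8601 with a UTC suffix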
Example #2
    def parse_author(self, response):

        published_time = dateParse(
            response.css(
                'meta[property="article:published_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
        try:
            modified_time = dateParse(
                response.css(
                    'meta[property="article:modified_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            return None

        for item in (
                response.css('body::attr(class)').extract_first()).split(' '):
            if 'postid' in item:
                id_constructor = item.split('-')

        qmfashionItem = QmfashionItem(
            _id='siddysays' + '-' + id_constructor[len(id_constructor) - 1],
            published_time=published_time,
            modified_time=modified_time,
            url=response.request.url,
            title=response.css('.title a::text').extract_first(),
            opening_text=extract_first_paragraph(
                response, "div.blogpost div.posttext div.sentry"),
            news_source="Siddysays",
            posted=False)

        return qmfashionItem
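The .replace(tzinfo=None) call that recurs in these spiders strips the parsed offset so the value can be compared with the naive datetime.now(); note it keeps the wall-clock reading rather than converting it. A small sketch of the pattern:

from datetime import datetime
from dateutil.parser import parse as dateParse

aware = dateParse("2018-01-05T09:30:00+05:00")
naive = aware.replace(tzinfo=None)      # drops the offset, keeps 09:30 as-is
naive.date() < datetime.now().date()    # now comparable with naive datetimes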
Example #3
    def parse_author(self, response):

        published_time = dateParse(
            response.css(
                'meta[property="article:published_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
        try:
            modified_time = dateParse(
                response.css(
                    'meta[property="article:modified_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            return None

        id_constructor = (response.css(
            'body article.the-post::attr(id)').extract_first()).split('-')

        qmfashionItem = QmfashionItem(
            _id='karachista' + '-' + id_constructor[len(id_constructor) - 1],
            published_time=published_time,
            modified_time=modified_time,
            url=response.request.url,
            title=response.css(
                'body article.the-post header.post-header div.post-meta .post-title::text'
            ).extract_first(),
            opening_text=extract_first_paragraph(
                response, 'body article.the-post div.post-content'),
            news_source="Karachista",
            posted=False)
        return qmfashionItem
Example #4
	def parse_author(self, response):

		global scrape_next_page

		published_time = dateParse(response.css('meta[property="article:published_time"]::attr(content)').extract_first())
		todays_date = datetime.now()
		if published_time.date() < todays_date.date():
			scrape_next_page = False
			return None
		try:
			modified_time = dateParse(response.css('meta[property="article:modified_time"]::attr(content)').extract_first())
		except:
			modified_time = published_time
		
		id_extractor = response.css('article::attr(id)').extract_first().split('-')

		first_paragraph = extract_summary(response, "div.post-" + str(id_extractor[len(id_extractor)-1]))


		category = response.css('meta[property="article:tag"]::attr(content)').extract()
		category.append('Business')

		newsterItem = NewsterItem(
			_id = 'brecorder' + '-' + str(id_extractor[len(id_extractor)-1]),
			url = response.request.url,
			published_time = published_time,
			modified_time = modified_time,
			title = response.css('title::text').extract_first().split('|')[0],
			category = list(set(category)),
			content = '\n\n'.join(response.css('div.post-' + str(id_extractor[len(id_extractor)-1]) + ' p *::text').extract()),
			image_link = response.css('meta[property="og:image"]::attr(content)').extract_first(),
			summary = first_paragraph
			)
		return newsterItem
Example #5
    def parseFlight(_class, string, date):

        # Remove keywords from flight string
        removeKeywords = ['Departing flight', 'depart', 'arrive',
                          'Change Planes in', 'stop', 'stops', 'Plane Change']
        regex = '|'.join(removeKeywords)
        # Turn into list and filter out blank [""] elements
        infoList = filter(
            lambda el: el != "", re.sub(regex, "", string).split(' '))

        # Parse number of layovers
        stops = int(infoList[4]) if infoList[4] != 'Non' else 0

        # Parse departure and arrival times
        departureDT = dateParse("%s %s" % (date, infoList[2]))
        arrivalDT = dateParse("%s %s" % (date, infoList[3]))

        # If your flight goes past midnight, it must arrive the next day
        if (arrivalDT < departureDT):
            arrivalDT += timedelta(days=1)

        price = infoList[1].split('$')[-1]

        # Build flight info dict
        flight = {
            'flights': tuple(infoList[0].split('/')),
            'price': price,
            'depDate': departureDT,
            'arrDate': arrivalDT,
            'stops': stops,
        }

        return flight
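One caveat with parseFlight as written: on Python 3, filter() returns a lazy iterator, so the later infoList[4] indexing only works on Python 2. A list comprehension is the version-safe equivalent (a sketch with an illustrative input string):

import re

removeKeywords = ['Departing flight', 'depart', 'arrive',
                  'Change Planes in', 'stop', 'stops', 'Plane Change']
regex = '|'.join(removeKeywords)
string = "Departing flight 123/456 $99 6:00AM depart 9:30AM arrive Nonstop"
infoList = [el for el in re.sub(regex, "", string).split(' ') if el != ""]
# ['123/456', '$99', '6:00AM', '9:30AM', 'Non']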
Example #6
    def parse_author(self, response):

        meta = response.css('head meta')
        published_time = dateParse(
            meta.css('[property="article:published_time"]::attr(content)').
            extract_first())
        modified_time = dateParse(
            meta.css('[property="article:modified_time"]::attr(content)').
            extract_first())

        first_paragraph = extract_summary(response,
                                          "article.story .story__content")

        category = response.css(
            'meta[property="article:section"]::attr(content)').extract()
        category.append(response.request.meta['category'])

        newsterItem = NewsterItem(
            _id='dawn' + '-' +
            response.css('.story__title::attr(data-id)').extract_first(),
            url=response.request.url,
            published_time=published_time,
            modified_time=modified_time,
            title=response.css('.story__title a::text').extract_first(),
            category=list(set(category)),
            content='\n\n'.join(
                response.css(
                    'article.story .story__content p *::text').extract()),
            image_link=meta.css(
                '[property="og:image"]::attr(content)').extract_first(),
            summary=first_paragraph)

        return newsterItem
Example #7
    def parse_author(self, response):

        published_time = dateParse(
            response.css(
                'meta[property="article:published_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
        try:
            modified_time = dateParse(
                response.css(
                    'meta[property="article:modified_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            return None

        article_title = response.css(
            'meta[property="og:title"]::attr(content)').extract_first()

        qmfashionItem = QmfashionItem(
            _id='secretcloset' + '-' +
            hashlib.md5(article_title.encode('utf-8')).hexdigest(),
            published_time=published_time,
            modified_time=modified_time,
            url=response.request.url,
            title=article_title,
            opening_text=extract_first_paragraph(
                response, 'div.main-container div.blogs div.blog-details'),
            news_source="Secretcloset",
            posted=False)

        return qmfashionItem
Example #8
	def parseFlight(_class, string, date, points = None):
		""" General format:
		Departing flight    123(/456)   $0000    12:30AM depart    7:25AM arrive     (Non/1/2)stop    (Change planes in XXX)
		[always]			[flt1/2]    [price]  [departure]       [arrival]   		 [# stops] 		  [connection]
		"""
		removeKeywords = ['Departing flight', 'depart', 'arrive', 'Change Planes in', 'stop', 'stops', 'Plane Change']
		regex = '|'.join(removeKeywords)
		infoList = filter(lambda el: el!="", re.sub(regex, "", string).split(' '))		
		stops = int(infoList[4]) if infoList[4] != 'Non' else 0	
		
		if stops == 0:
			connecting_arpts = []
		elif ( infoList[5] not in SWAFareSpider.cities):
			connecting_arpts = []
		else:
			connecting_arpts = list(infoList[5].split('/'))
		
		departureDT = dateParse("%s %s" % (date, infoList[2]) )
		arrivalDT = dateParse("%s %s" % (date, infoList[3]) )
		if ( arrivalDT < departureDT ): arrivalDT += timedelta(days=1)
		
		flight = {
			'flight': tuple(infoList[0].split('/')),
			'price': int(infoList[1][1:].replace(",","")),
			'depart': departureDT,
			'arrive': arrivalDT,
			'depart_date' : date,
			'stops': stops,
			'connecting_arpts': connecting_arpts,
			'fare_validity_date': datetime.now(), 
			'points': int(points.replace(",", "")) if points else None  # points defaults to None
		}
		return flight
Example #9
def is_date(string):
    try:
        dateParse(string)
        #print(string + " is a date")
        return True
    except ValueError:
        #print(string + " is not a date")
        return False
Example #10
    def parse_author(self, response):

        published_time = dateParse(
            response.css(
                'meta[property="og:article:published_time"]::attr(content)').
            extract_first()).astimezone(get_localzone()).replace(tzinfo=None)

        todays_date = datetime.datetime.now(datetime.timezone.utc).astimezone(
            get_localzone())
        if published_time.date() < todays_date.date():
            return None
        try:
            modified_time = dateParse(
                response.css(
                    'meta[property="og:article:modified_time"]::attr(content)'
                ).extract_first()).astimezone(
                    get_localzone()).replace(tzinfo=None)
        except:
            modified_time = published_time

        id_extractor = str(response.request.url).split('-')

        first_paragraph = extract_summary(
            response, "div.container_17wb1 div.body_1gnLA")
        if len(first_paragraph) < 7:
            first_paragraph = extract_summary(
                response,
                "div.StandardArticleBody_container div.StandardArticleBody_body"
            )

        category = response.css(
            'meta[property="og:article:section"]::attr(content)').extract()
        category.append('Tigrosa-Internation')

        content = response.css(
            'div.container_17wb1 div.body_1gnLA p *::text').extract()
        if len(content) < 10:
            content = response.css(
                'div.StandardArticleBody_container div.StandardArticleBody_body p *::text'
            ).extract()

        newsterItem = NewsterItem(
            _id='reuters' + '-' + id_extractor[len(id_extractor) - 1],
            url=response.request.url,
            published_time=published_time,
            modified_time=modified_time,
            title=response.css(
                'head title::text').extract_first().lstrip().split('|')[0],
            category=category,
            content='\n\n'.join(content),
            image_link=response.css(
                'meta[property="og:image"]::attr(content)').extract_first(),
            summary=first_paragraph)
        return newsterItem
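The astimezone(get_localzone()).replace(tzinfo=None) chain used here first converts the parsed time into the machine's local zone and only then drops the tzinfo, unlike the bare replace(tzinfo=None) in the earlier spiders. A sketch, assuming get_localzone comes from the tzlocal package:

import datetime
from dateutil.parser import parse as dateParse
from tzlocal import get_localzone

aware = dateParse("2019-04-02T08:00:00+00:00")
local = aware.astimezone(get_localzone()).replace(tzinfo=None)  # local wall clock
local.date() < datetime.datetime.now().date()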
Example #11
    def test_it_should_handle_time_interval(self):
      r = get_entity_value({
        'kind': 'TimeInterval',
        'from': '2017-06-07 18:00:00 +02:00',
        'to': '2017-06-08 00:00:00 +02:00',
      })

      expected_from = dateParse('2017-06-07 18:00:00 +02:00')
      expected_to = dateParse('2017-06-08 00:00:00 +02:00')

      expect(r).to.be.a(tuple)
      expect(r[0]).to.equal(expected_from)
      expect(r[1]).to.equal(expected_to)
Example #12
    def parse_author(self, response):

        global scrape_next_page
        meta = response.css('head meta')
        header = response.css('div.story .template__header')

        try:
            published_time = dateParse(
                meta.css('[property="article:published_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            return None

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            scrape_next_page = False
            return None
        try:
            modified_time = dateParse(
                meta.css('[property="article:modified_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        first_paragraph = extract_summary(
            response, "div.clearfix.story-content.read-full")

        category = response.css(
            'meta[property="article:tag"]::attr(content)').extract()
        article_section = response.css(
            'meta[property="article:section"]::attr(content)').extract_first()
        if article_section.lower() == 'pakistan':
            article_section = 'national'
        category.append(article_section)

        newsterItem = NewsterItem(
            _id='tribune' + '-' +
            response.css('.story::attr(id)').extract_first().split('-')[1],
            url=response.request.url,
            published_time=published_time,
            modified_time=modified_time,
            title=response.css(
                'div.story.clearfix h1.title a::text').extract_first(),
            category=list(set(category)),
            content='\n\n'.join(
                response.css('div.clearfix.story-content.read-full p *::text').
                extract()),
            image_link=response.css(
                'div.story-image-container img::attr(src)').extract_first(),
            summary=first_paragraph)
        return newsterItem
Example #13
    def parse_author(self, response):

        published_time = dateParse(
            response.css(
                'article header.entry-header span.posted-on time[itemprop="datePublished"]::attr(content)'
            ).extract_first()).replace(tzinfo=None)

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            return None

        try:
            modified_time = dateParse(
                response.css(
                    'article header.entry-header span.posted-on time[itemprop="dateModified"]::attr(content)'
                ).extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        article_title = response.css(
            'div.container h2.entry-title::text').extract_first()

        id_extractor = response.css('article::attr(id)').extract_first().split(
            '-')

        first_paragraph = extract_summary(
            response, "article div.entry-content div.content-body")

        category = []
        if response.request.meta['category'].lower() == 'pakistan':
            category.append('National')
        else:
            category.append(response.request.meta['category'])

        newsterItem = NewsterItem(
            _id='dailypakistan' + '-' + id_extractor[len(id_extractor) - 1],
            url=response.request.url,
            published_time=published_time,
            modified_time=modified_time,
            title=article_title,
            category=list(set(category)),
            content='\n\n'.join(
                response.css(
                    'article div.entry-content div.content-body p *::text').
                extract()),
            image_link=response.css(
                'article header.entry-header div[itemprop="image"] img::attr(src)'
            ).extract_first(),
            summary=first_paragraph)
        return newsterItem
Example #14
    def parse_author(self, response):

        global scrape_next_page

        article_section = response.css(
            'meta[property="article:section"]::attr(content)').extract_first()
        if article_section != 'HEADLINES':
            return None

        article_title = response.css(
            'div.td-post-header h1.entry-title::text').extract_first()

        published_time = dateParse(
            response.css('meta[itemprop="datePublished"]::attr(content)').
            extract_first()).replace(tzinfo=None)
        try:
            modified_time = dateParse(
                response.css('meta[itemprop="dateModified"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            scrape_next_page = False
            return None

        child_int = 1
        while True:
            first_para_text = "div.td-post-content > p:nth-of-type(" + str(
                child_int) + ") *::text"
            first_paragraph = ''.join(response.css(first_para_text).extract())
            if len(first_paragraph) > 7:
                break
            child_int = child_int + 1

        newsterItem = NewsterItem(
            _id='pakistantoday' + '-' +
            response.css('article::attr(id)').extract_first().split('-')[1],
            url=response.request.url,
            published_time=published_time,
            modified_time=modified_time,
            title=article_title,
            content='\n\n'.join(
                response.css('div.td-post-content p *::text').extract()),
            image_link=response.css(
                'div.td-post-featured-image img::attr(src)').extract_first(),
            summary=first_paragraph)
        return newsterItem
Example #15
def keyevent():
    body = request.get_json()
    requiredFields = ['requestTime', 'pressedKeys']
    if all(field in body for field in requiredFields):
        requestTimeStr = body['requestTime']
        pressedKeysDict = body['pressedKeys']
        pressedKeys = [int(k) for k in pressedKeysDict]
        requestTime = dateParse(requestTimeStr)
        lastRequest = dateParse(state['controller']['lastRequest'])
        if requestTime > lastRequest:
            ALLOWED_KEYS = set([32, 37, 38, 39, 40])
            allowedPressedKeys = list(ALLOWED_KEYS & set(pressedKeys))
            state['controller']['pressedKeys'] = allowedPressedKeys
            state['controller']['lastRequest'] = requestTimeStr
    return ''
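keyevent orders requests by comparing the two parsed timestamps directly. That comparison is only defined when both strings carry an offset (or both lack one); Python raises TypeError when an aware and a naive datetime are mixed. A quick illustration:

from dateutil.parser import parse as dateParse

a = dateParse("2020-05-01T12:00:00+00:00")
b = dateParse("2020-05-01T13:00:00+00:00")
assert b > a  # both aware, so the comparison is well-defined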
Example #16
    def parse_author(self, response):

        published_time = dateParse(
            response.css(
                'meta[property="article:published_time"]::attr(content)').
            extract_first()).astimezone(get_localzone()).replace(tzinfo=None)

        todays_date = datetime.datetime.now(datetime.timezone.utc).astimezone(
            get_localzone())
        if published_time.date() < todays_date.date():
            return None
        try:
            modified_time = dateParse(
                response.css(
                    'meta[property="article:modified_time"]::attr(content)').
                extract_first()).astimezone(
                    get_localzone()).replace(tzinfo=None)
        except:
            modified_time = published_time

        id_extractor = response.css('article::attr(id)').extract_first().split(
            '-')

        first_paragraph = extract_summary(
            response, "article.post-" +
            str(id_extractor[len(id_extractor) - 1]) + " div.td-post-content")

        category = response.css(
            'article div.td-post-source-tags li a::text').extract()
        category.append('Tigrosa-Internation')

        newsterItem = NewsterItem(
            _id='mettisglobal' + '-' +
            str(id_extractor[len(id_extractor) - 1]),
            url=response.request.url,
            published_time=published_time,
            modified_time=modified_time,
            title=response.css(
                'meta[property="og:title"]::attr(content)').extract_first(),
            category=list(set(category)),
            content='\n\n'.join(
                response.css('article.post-' +
                             str(id_extractor[len(id_extractor) - 1]) +
                             ' div.td-post-content p *::text').extract()),
            image_link=response.css(
                'meta[property="og:image"]::attr(content)').extract_first(),
            summary=first_paragraph)
        return newsterItem
Example #17
    def parse_author(self, response):

        published_time = dateParse(
            response.css('div.container div.category-date::text').
            extract_first()).replace(tzinfo=None)
        modified_time = published_time

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            return None

        first_paragraph = extract_summary(response, "div.story-detail")

        article_url_pieces = str(response.request.url).split('/')

        newsterItem = NewsterItem(
            _id='thenews' + '-' +
            article_url_pieces[len(article_url_pieces) - 1].split('-')[0],
            url=response.request.url,
            published_time=published_time,
            modified_time=modified_time,
            title=response.css(
                'meta[property="og:title"]::attr(content)').extract_first(),
            category=response.css(
                'body div.detail-content div.category-name h2::text').extract(
                ),
            content='\n\n'.join(
                response.css('div.story-detail p *::text').extract()),
            image_link=response.css(
                'meta[property="og:image"]::attr(content)').extract_first(),
            summary=first_paragraph)
        return newsterItem
Example #18
def main():
    data = json.load(open('rates.json', 'r'))

    headers = []

    for d in data:
        if d["currency"] in headers:
            pass
        else:
            headers.append(d["currency"])

    arranged_data = {}
    l_h = []
    plt.figure()

    for header in headers:
        arranged_data[header] = list(
            filter(lambda d: d["currency"] == header, data))
        dates = []
        rates = []
        for d in arranged_data[header]:
            try:
                rates.append(float(d["rate"]))
                dates.append(dateParse(d["date"]))
            except:
                pass
        h, = plt.plot(dates, rates, label=header)
        l_h.append(h)

    with open('rates_rearranged.json', 'w') as outfile:
        json.dump(arranged_data, outfile)

    plt.legend(handles=l_h)
    plt.show()
Example #19
def controllerLoop():
    while True:
        if state['done']:
            done()
            break
        now = datetime.now(pytz.utc)
        parsedLastRequest = dateParse(state['controller']['lastRequest'])
        dormant = (now - parsedLastRequest).total_seconds() >= 2  # total_seconds() handles deltas longer than a day
        if not dormant:
            pressedKeys = state['controller']['pressedKeys']
            SPACEBAR = 32
            LEFT = 37
            UP = 38
            RIGHT = 39
            DOWN = 40
            if SPACEBAR in pressedKeys:
                state['done'] = True
            elif any(key in pressedKeys for key in [LEFT, UP, RIGHT, DOWN]):
                UNIT = 0.05
                targetX, targetY = state['currentCoordinates']
                if LEFT in pressedKeys:
                    targetX -= UNIT
                if UP in pressedKeys:
                    targetY += UNIT
                if RIGHT in pressedKeys:
                    targetX += UNIT
                if DOWN in pressedKeys:
                    targetY -= UNIT
                moved = moveArmTo(targetX, targetY)
                if moved:
                    state['currentCoordinates'] = [targetX, targetY]
        else:
            sleep(0.1)
Example #20
def get_forecasts(request):
  """
  :type request: Request

  """

  date = request.slot('date').first().value

  if not date:
    return request.ask('date', _('For when do you want the forecast?')) # pylint: disable=E0602
  
  location = request.slot('city').first().value

  if not location:
    return request.ask('city', _('For where do you want the forecast?')) # pylint: disable=E0602

  request.show(_("Well, I'm on it!")) # pylint: disable=E0602

  time.sleep(3) # Simulate fetching

  # Do something with the key
  api_key = request.env('WEATHER_API_KEY') # pylint: disable=W0612

  request.show(_("It's kinda sunny!"), # pylint: disable=E0602
    cards=[{
      "media": b64_icons['sunny'],
      "header": "24°C",
      "subhead": dateParse(date).strftime("%A %d %B"),
      "text": "Looks like it's sunny outside!"
    }], terminate=True)  # pylint: disable=E0602
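The card subhead formats the parsed slot value into a readable date; an illustrative run of that line:

from dateutil.parser import parse as dateParse

dateParse('2017-06-07').strftime("%A %d %B")  # 'Wednesday 07 June'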
Example #21
	def parse_author(self, response):
		
		published_time = dateParse(response.css('main[id="content"] article header div.entry-meta time::attr(datetime)').extract_first()).replace(tzinfo=None)
		modified_time = published_time

		todays_date = datetime.now()
		if published_time.date() < todays_date.date():
			return None

		id_constructor = response.css('div.site-content main[id="content"] article::attr(id)').extract_first().split('-')

		first_paragraph = extract_first_paragraph(response,'div.site-content main[id="content"] article div.entry-content')
		if first_paragraph is None:
			first_paragraph = response.css('div.site-content main[id="content"] article div.single-title .entry-title::text').extract_first()

		qmfashionItem = QmfashionItem(
			_id = 'sunday' + '-' + id_constructor[len(id_constructor)-1],
			published_time = published_time,
			modified_time = modified_time,
			url = response.request.url,
			title = response.css('div.site-content main[id="content"] article div.single-title .entry-title::text').extract_first(),
			opening_text = first_paragraph,
			news_source = "Sunday.com.pk",
			posted = False
			)
		return qmfashionItem
Example #22
def convertDate( strDate ) :

    '''
    Accepts the date string as returned by the IMAP server, translates it into
    the client's local time zone, and returns it as a string formatted as desired in the final output.
    '''

    try:
        from dateutil.parser import parse as dateParse

    except ImportError:

        print("dateutil module missing. Try: pip install python-dateutil")
        import sys
        sys.exit(1)


    dt = dateParse( strDate.split( '(' )[0] )		# Split on the left parenthesis because the date string occasionally ends with something like (GMT-06:00)

    Local = LocalTimezone()		# create an instance of the LocalTimezone class defined above

    try:
        ldt = dt.astimezone( Local )

    except ValueError:

        print('Error - Using .astimezone(local).')
        return ''

    return ldt.strftime( '%b %d - %I:%M %p' )		# '%p' is the standard AM/PM directive; '%P' is a non-portable glibc extension
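A usage sketch of convertDate's core transformation, substituting dateutil's tzlocal for the LocalTimezone class the original module defines elsewhere; the IMAP date string is illustrative:

from dateutil.parser import parse as dateParse
from dateutil.tz import tzlocal

raw = 'Tue, 5 Mar 2019 10:15:00 -0600 (GMT-06:00)'
dt = dateParse(raw.split('(')[0])
dt.astimezone(tzlocal()).strftime('%b %d - %I:%M %p')  # wall-clock time in the local zone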
Example #23
def getTopic(topicURL):
    #to get the topic ID we split the url twice
    topicID = topicURL.split("=")[-1].split(".")[0]
    print(topicID)

    tree = parsedHTML('topic', topicID, '0')

    #Get the timestamp first, if it's not from a year we want, skip it
    try:
        timestamp = tree.xpath('//div[@class="smalltext"]')[1].text_content()
    except:
        print('NO TIMESTAMP FOUND')
        print(topicURL)
        return False

    todayDate = str(time.strftime('%d %B %Y'))
    timestamp = timestamp.replace('Today', todayDate)
    timestamp = timestamp.replace(' at', ",")
    timestamp = dateParse(timestamp)

    postBody = tree.xpath('//div[@class="post"]')[0].text_content()
    print(postBody)
    authorActivity = tree.xpath(
        '//td[@class="poster_info"]/div[@class="smalltext"]')[0].text_content(
        ).split('Activity: ')[-1].split('\n')[0]

    return [postBody, timestamp, authorActivity]
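The Today/at rewrites massage the forum timestamp into a form dateutil accepts; an illustrative round trip:

from dateutil.parser import parse as dateParse

raw = 'Today at 10:15:22'.replace('Today', '05 March 2019').replace(' at', ',')
dateParse(raw)  # datetime.datetime(2019, 3, 5, 10, 15, 22)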
Example #24
    def parse_author(self, response):

        published_time = dateParse(
            response.css(
                'body article[id="the-post"] p.post-meta span.tie-date::text').
            extract_first()).replace(tzinfo=None)
        modified_time = published_time

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            return None

        for item in (
                response.css('body::attr(class)').extract_first()).split(' '):
            if 'postid' in item:
                id_constructor = item.split('-')

        qmfashionItem = QmfashionItem(
            _id='trendinginsocial' + '-' +
            id_constructor[len(id_constructor) - 1],
            published_time=published_time,
            modified_time=modified_time,
            url=response.request.url,
            title=response.css(
                'article[id="the-post"] div.post-inner .post-title span::text'
            ).extract_first(),
            opening_text=extract_first_paragraph(
                response, 'article[id="the-post"] div.post-inner div.entry >'),
            news_source="Trendinginsocial",
            posted=False)

        return qmfashionItem
Example #25
    def parseFlight(_class, string, date):
        """ General format:
		Departing flight    123(/456)   $0000    12:30AM depart    7:25AM arrive     (Non/1/2)stop    (Change planes in XXX)
		[always]			[flt1/2]    [price]  [departure]       [arrival]   		 [# stops] 		  [connection]
		"""

        # Remove keywords from flight string
        removeKeywords = [
            'Departing flight', 'depart', 'arrive', 'Change Planes in', 'stop',
            'stops', 'Plane Change'
        ]
        regex = '|'.join(removeKeywords)
        # Turn into list and filter out blank [""] elements
        infoList = filter(lambda el: el != "",
                          re.sub(regex, "", string).split(' '))

        # Parse number of layovers
        stops = int(infoList[4]) if infoList[4] != 'Non' else 0

        # Parse connecting airports (if applicable)
        if (infoList[5] not in SWAFareSpider.cities):
            # no valid connection
            connectingArpts = None
        else:
            connectingArpts = tuple(infoList[5].split('/'))

        # Parse departure and arrival times
        departureDT = dateParse("%s %s" % (date, infoList[2]))
        arrivalDT = dateParse("%s %s" % (date, infoList[3]))

        # If your flight goes past midnight, it must arrive the next day
        if (arrivalDT < departureDT): arrivalDT += timedelta(days=1)

        # Build flight info dict
        flight = {
            'flight': tuple(infoList[0].split('/')),
            'price': infoList[1],
            'depart': departureDT,
            'arrive': arrivalDT,
            'stops': stops,
            'connectingArpts': connectingArpts,
            'fareValidityDate': datetime.now()
        }

        return flight
Example #26
def get_article_urls(end_date):
    """Main function."""
    filename = "coindesk_headlines.csv"
    urls, current_page = [], 1
    has_next_page, out_of_range = True, False

    while has_next_page and not out_of_range:
        config = results_config(current_page)
        tree = parse_html(config["coindesk"]["page_url"])
        items = tree.xpath(config["coindesk"]["item_XPATH"])

        for item in items:
            if config["coindesk"]["date_on_page"] and config["coindesk"][
                    "date_ordered"] and end_date:
                date = (dateParse(
                    item.xpath(
                        config["coindesk"]["date_XPATH"])[0].get("datetime"))
                        ).strftime("%Y-%m-%d")

                if dateParse(date) <= dateParse(end_date):
                    out_of_range = True

            url = item.xpath(config["coindesk"]["url_XPATH"])[0].get("href")

            if "://" not in url:
                url = results_config(
                    current_page)["coindesk"]["base_url"] + url

            url_filters = [
                "/videos/", "/audio/", "/gadfly/", "/features/",
                "/press-releases/"
            ]
            if not any(token in url for token in url_filters):
                urls.append(url)

        if len(items) < config["coindesk"]["results_per_page"]:
            has_next_page = False

        collect_articles(urls, end_date, filename)

        current_page += 1
        urls = []
Example #27
    def parse_author(self, response):
        meta = response.css('head meta')

        category = meta.css(
            '[property="article:section"]::attr(content)').extract()
        valid_article = False

        for item in category:
            if item == "Business" or item == "National":
                valid_article = True
                break
        if not valid_article:
            return None

        published_time = dateParse(
            meta.css('[property="article:published_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)

        try:
            modified_time = dateParse(
                meta.css('[property="article:modified_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        article_title = response.css('head title::text').extract_first()

        first_paragraph = extract_summary(response, "article .post-content")

        newsterItem = NewsterItem(
            _id='nation' + '-' +
            hashlib.md5(article_title.encode('utf-8')).hexdigest(),
            url=response.request.url,
            published_time=published_time,
            modified_time=modified_time,
            title=article_title,
            category=list(set(category)),
            content='\n\n'.join(
                response.css('article .post-content p *::text').extract()),
            image_link=meta.css(
                '[property="og:image"]::attr(content)').extract_first(),
            summary=first_paragraph)

        return newsterItem
Example #28
	def parseFlight(_class, string, date):
		""" General format:
		Departing flight    123(/456)   $0000    12:30AM depart    7:25AM arrive     (Non/1/2)stop    (Change planes in XXX)
		[always]			[flt1/2]    [price]  [departure]       [arrival]   		 [# stops] 		  [connection]
		"""


		# Remove keywords from flight string
		removeKeywords = ['Departing flight', 'depart', 'arrive', 'Change Planes in', 'stop', 'stops', 'Plane Change']
		regex = '|'.join(removeKeywords)
		# Turn into list and filter out blank [""] elements
		infoList = filter(lambda el: el!="", re.sub(regex, "", string).split(' '))
		
		# Parse number of layovers
		stops = int(infoList[4]) if infoList[4] != 'Non' else 0	
		
		# Parse connecting airports (if applicable)
		if ( infoList[5] not in SWAFareSpider.cities ):
			# no valid connection
			connectingArpts = None
		else:
			connectingArpts = tuple(infoList[5].split('/'))
		
		# Parse departure and arrival times
		departureDT = dateParse("%s %s" % (date, infoList[2]) )
		arrivalDT = dateParse("%s %s" % (date, infoList[3]) )
		
		# If your flight goes past midnight, it must arrive the next day
		if ( arrivalDT < departureDT ): arrivalDT += timedelta(days=1)
		
		# Build flight info dict
		flight = {
			'flight': tuple(infoList[0].split('/')),
			'price': infoList[1],
			'depart': departureDT,
			'arrive': arrivalDT,
			'stops': stops,
			'connectingArpts': connectingArpts,
			'fareValidityDate': datetime.now()
		}

		return flight
Example #29
    def test_it_should_handle_instant_time(self):
      r = get_entity_value({
        'kind': 'InstantTime',
        'value': '2017-06-13 18:00:00 +02:00',
        'grain': 'Hour',
        'precision': 'Exact',
      })

      expected = dateParse('2017-06-13 18:00:00 +02:00')

      expect(r).to.be.a(datetime.datetime)
      expect(r).to.equal(expected)
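dateParse keeps the +02:00 offset from the string as a fixed tzoffset, which is what makes the equality assertion above well-defined. A quick check (a sketch; tzoffset comes from dateutil.tz):

from dateutil.parser import parse as dateParse
from dateutil.tz import tzoffset

dt = dateParse('2017-06-13 18:00:00 +02:00')
assert dt.tzinfo == tzoffset(None, 7200)  # +02:00 stored as a fixed offset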
Example #30
def collect_articles(urls, end_date, filename):
    """Loops over all the URLs collected in the parent function."""

    for url in urls:
        tree = parse_html(url)
        config = page_config(tree)

        try:
            if end_date and dateParse(config["date"]) < dateParse(end_date):
                break
            else:
                csv_writer = csv.writer(
                    open(
                        os.path.dirname(os.getcwd()) + "/../data/" + filename,
                        "a"))
                csv_writer.writerow(
                    [config["date"],
                     ftfy.fix_text(config["title"]), url])
        except Exception:
            print("\nEXCEPTION OCCURRED\n")
Example #31
    def parse_author(self, response):

        try:
            published_time = dateParse(
                response.css(
                    'meta[property="article:published_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            published_time = datetime.now()
        try:
            modified_time = dateParse(
                response.css(
                    'meta[property="article:modified_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            raise scrapy.exceptions.CloseSpider('termination condition met')

        for item in (
                response.css('body::attr(class)').extract_first()).split(' '):
            if 'postid' in item:
                id_constructor = item.split('-')

        qmfashionItem = QmfashionItem(
            _id='mangobaaz' + '-' + id_constructor[len(id_constructor) - 1],
            published_time=published_time,
            modified_time=modified_time,
            url=response.request.url,
            title=response.css('title::text').extract_first(),
            opening_text=extract_first_paragraph(
                response,
                'article[id="post-' + id_constructor[len(id_constructor) - 1] +
                '"] div.entry-content >'),
            news_source="Mangobaaz",
            posted=False)
        return qmfashionItem
Example #32
	def parse_author(self, response):

		global scrape_next_page

		published_time = dateParse(response.css('meta[property="article:published_time"]::attr(content)').extract_first()).replace(tzinfo=None)

		todays_date = datetime.now()
		if published_time.date() < todays_date.date():
			scrape_next_page = False
			return None

		try:
			modified_time = dateParse(response.css('meta[property="article:modified_time"]::attr(content)').extract_first()).replace(tzinfo=None)
		except:
			modified_time = published_time

		article_title = response.css('div.post-header h1.entry-title::text').extract_first()

		id_extractor = response.css('body::attr(class)').extract_first().split(' ')
		for item in id_extractor:
			if 'postid' in item:
				article_id = item.split('-')[len(item.split('-'))-1]

		first_paragraph = extract_summary(response, "main.content div.entry-content")

		category = response.css('meta[property="article:section"]::attr(content)').extract()
		category.append(response.request.meta['category'])

		newsterItem = NewsterItem(
			_id = 'dailytimes' + '-' + article_id,
			url = response.request.url,
			published_time = published_time,
			modified_time = modified_time,
			title = article_title,
			category = list(set(category)),
			content = '\n\n'.join(response.css('main.content div.entry-content p *::text').extract()),
			image_link = response.css('meta[property="og:image"]::attr(content)').extract_first(),
			summary = first_paragraph
			)
		return newsterItem
Example #33
    def parse_author(self, response):

        published_time = dateParse(
            response.css(
                'meta[property="article:published_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
        try:
            modified_time = dateParse(
                response.css(
                    'meta[property="article:modified_time"]::attr(content)').
                extract_first()).replace(tzinfo=None)
        except:
            modified_time = published_time

        todays_date = datetime.now()
        if published_time.date() < todays_date.date():
            return None

        for item in (
                response.css('body::attr(class)').extract_first()).split(' '):
            if 'postid' in item:
                id_constructor = item.split('-')

        qmfashionItem = QmfashionItem(
            _id='edition.pk' + '-' + id_constructor[len(id_constructor) - 1],
            published_time=published_time,
            modified_time=modified_time,
            url=response.request.url,
            title=response.css(
                'body article.post div[id="post-header"] .post-title::text').
            extract_first(),
            opening_text=extract_first_paragraph(
                response,
                'body article.post div[id="post-area"] div[id="content-area"] div.content-main'
            ),
            news_source="Edition.pk",
            posted=False)
        return qmfashionItem
Example #34
def getMinDate(genName,baseURL='http://159.203.100.177:3000'):

    url = baseURL+"/generators_times"

    querystring = {"id": "eq."+genName, "select": "min_time"}

    payload = ""
    headers = {
        'cache-control': "no-cache"
    }

    response = requests.request("GET", url, data=payload, headers=headers, params=querystring)
    if len(response.json()):
        dateStr=response.json()[0].get('min_time')
        return dateParse(dateStr).date()
Example #35
def convertDate(d, dayfirst):
    result = -1
    # Sometimes dates are just the year or day.
    try:
        if d != "" and not pd.isnull(d):

            df = "%Y-%m-%d"
            dt = dateParse(d, dayfirst=dayfirst)  # dayfirst is a bool; the old code passed the format string here
            result = dt.strftime(df)

        else:
            result = d

    except ValueError:
        result = -1

    return result
Example #36
def convertDate(d, dayfirst):

    result = -1
    # Sometimes dates are just the year or day, so we try to parse the date;
    # if parsing fails, return -1, since d is either not a full date or a bad value
    try:
        if d != "" and not pd.isnull(d):

            df = "%Y-%m-%d"
            dt = dateParse(d, dayfirst=dayfirst)
            result = (dt.strftime(df))

        else:
            result = d

    except ValueError:
        result = -1

    return result
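The dayfirst flag is what disambiguates purely numeric dates here; an illustrative pair:

from dateutil.parser import parse as dateParse

dateParse("05/03/2019", dayfirst=True)   # datetime.datetime(2019, 3, 5, 0, 0)
dateParse("05/03/2019", dayfirst=False)  # datetime.datetime(2019, 5, 3, 0, 0)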
Example #37
	def __init__(self, rawDate):
		self.rawTime = rawDate
		self.dateTime = dateParse(rawDate)
		self._unixTime = mktime(self.dateTime.timetuple()) + 1e-6 * self.dateTime.microsecond
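The mktime round-trip above converts the parsed naive datetime into epoch seconds, interpreting it as local time and re-attaching the microseconds. On Python 3, datetime.timestamp() is an equivalent one-liner (a minimal sketch):

from dateutil.parser import parse as dateParse

dt = dateParse("2016-09-01 12:00:00.500000")
unix = dt.timestamp()  # naive datetimes are interpreted as local time, like mktime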
Example #38
    def gritsSearch(self, params):

        folder = self.gritsFolder()

        self.checkAccess()

        limit, offset, sort = self.getPagingParameters(params, 'meta.date')
        sDate = dateParse(params.get('start', '1990-01-01'))
        eDate = dateParse(params.get('end', str(datetime.now())))
        useRegex = 'regex' in params

        query = {
            'folderId': folder['_id'],
            'meta.date': {'$gte': sDate, '$lt': eDate}
        }

        self.addToQuery(query, params, 'country', useRegex)
        self.addToQuery(query, params, 'disease', useRegex)
        self.addToQuery(query, params, 'species', useRegex)
        self.addToQuery(query, params, 'feed', useRegex)
        self.addToQuery(query, params, 'description', useRegex)
        self.addToQuery(
            query,
            params,
            'diagnosis',
            useRegex,
            'meta.diagnosis.diseases',
            'name'
        )
        self.addToQuery(query, params, 'id', useRegex, 'name')

        model = ModelImporter().model('item')
        cursor = model.find(
            query=query,
            fields=None,
            offset=offset,
            limit=limit,
            sort=sort
        )
        result = list(cursor)
        if not self.checkAccess(priv=True, fail=False):
            result = [model.filter(i) for i in result]

        if 'randomSymptoms' in params:
            try:
                filterBySymptom = set(json.loads(params['filterSymptoms']))
            except Exception:
                filterBySymptom = False
            filtered = []
            for r in result:
                r['meta']['symptoms'] = self.getSymptomFromId(r['_id'])
                if filterBySymptom:
                    s2 = set(r['meta']['symptoms'])
                    if not filterBySymptom.isdisjoint(s2):
                        filtered.append(r)
                else:
                    filtered.append(r)
            result = filtered

        if 'geoJSON' in params:
            result = self.togeoJSON(result)
        return result
Example #39
    def handleRequestSuccess(self, workQueueItem, response):
        result = json.load(response)
        if "items" in result:
            for item in result['items']:
                #database mapping
                db_meta = {'id': item['id']}

                #snippet
                snippet = item['snippet']
                db_meta['snippet_publishedAt'] = dateParse(snippet['publishedAt'])
                db_meta['snippet_channel_id'] = snippet['channelId']
                db_meta['snippet_title'] = snippet['title']
                db_meta['snippet_description'] = snippet['description']
                db_meta['snippet_channel_title'] = snippet['channelTitle']
                db_meta['snippet_category_id'] = snippet['categoryId']
                db_meta['snippet_liveBroadcastContent'] = snippet['liveBroadcastContent']
                db_meta['snippet_tags'] = json.dumps(snippet['tags']) if snippet.get('tags') else ''

                #contentDetails
                c_details = item['contentDetails']
                db_meta['contentDetails_duration'] = c_details['duration']
                db_meta['contentDetails_durationAsSeconds'] = self.ISO8601durationToSeconds(c_details['duration'])
                db_meta['contentDetails_dimension'] = c_details['dimension']
                db_meta['contentDetails_definition'] = c_details['definition']
                db_meta['contentDetails_caption'] = c_details['caption']
                db_meta['contentDetails_licensedContent'] = c_details['licensedContent']

                #status
                status = item['status']
                db_meta['status_uploadStatus'] = status['uploadStatus']
                db_meta['status_privacyStatus'] = status['privacyStatus']
                db_meta['status_license'] = status['license']
                db_meta['status_embeddable'] = status['embeddable']
                db_meta['status_publicStatsViewable'] = status['publicStatsViewable']

                #statistics
                stats = item['statistics']
                db_meta['statistics_viewCount'] = stats['viewCount']
                db_meta['statistics_likeCount'] = stats.get('likeCount') or ''
                db_meta['statistics_dislikeCount'] = stats.get('dislikeCount') or ''
                db_meta['statistics_favoriteCount'] = stats['favoriteCount']
                db_meta['statistics_commentCount'] = stats['commentCount']

                #recordingDetails
                def deep_get(item, *attrs):
                    ''' Get item, or return fallback value from nested dicts '''
                    if item and not isinstance(item, dict):
                        return item
                    if not item:
                        return None
                    return deep_get(item.get(attrs[0]), *attrs[1:])

                if deep_get(item, 'recordingDetails', 'recordingDate'):
                    db_meta['recordingDetails_recordingDate'] = dateParse(deep_get(item, 'recordingDetails', 'recordingDate'))
                else:
                    db_meta['recordingDetails_recordingDate'] = datetime.utcfromtimestamp(0)

                db_meta['recordingDetails_location_latitude'] = deep_get(item, 'recordingDetails', 'location', 'latitude') or 0
                db_meta['recordingDetails_location_longitude'] = deep_get(item, 'recordingDetails', 'location', 'longitude') or 0
                db_meta['recordingDetails_location_altitude'] = deep_get(item, 'recordingDetails', 'location', 'altitude') or 0

                self.resultList[item['id']] = db_meta
Example #40
    def toDatetime(self, string):
        try:
            return dateParse(self.formatInputString(string))
        except (TypeError, ValueError, OverflowError):  # unparseable input
            return None
Example #41
def main():
    # === Extract options ===
    parser = OptionParser(usage="usage: %prog [options] <timeColumn> <timeInterval> <groupByColumn> ...")
    parser.add_option(
        '-p', '--pivot', dest='pivot', action='store_true', default=False,
        help='store the data until the end of the stream and then pivot it into groupByCol groups (SIGNIFICANT MEMORY USAGE)' # noqa
    )
    parser.add_option('-s', dest='sep', default='|', help='groupByCol separator when pivoting')
    parser.add_option('-m', '--multiplier', dest='multiplier', default=100)
    (options, args) = parser.parse_args()

    if len(args) < 3:
        parser.print_usage()
        exit()

    pivot = options.pivot
    colNameSep = options.sep
    multiplier = int(options.multiplier)
    timeCol = int(args[0])
    interval = int(args[1])
    groupCols = []
    for i in range(2, len(args)):
        groupCols.append(int(args[i]))

    # Data is a complex data structure; with the following layout:
    # Timestamp (start of interval)
    # -- {(col1 val, col2 val, col3 val...)}
    # -- -- Count
    data = {}

    # Similarly, if we're pivoting we'll keep track of unique columns through time
    # (col1 val, col2 val, col3 val...)
    uniqueCols = set()

    lineCount = 0
    for line in sys.stdin:
        parts = line.strip().split(' ')

        # Find the agg time
        ctime = int(dateParse(parts[timeCol]).strftime('%s'))  # Yes, this is horribly inefficient; meh
        ctime = (ctime / interval) * interval

        colVals = []
        for i in groupCols:
            colVals.append(parts[i])
        colVals = tuple(colVals)

        if ctime not in data:
            data[ctime] = {}
        if colVals not in data[ctime]:
            data[ctime][colVals] = 1
        else:
            data[ctime][colVals] += 1

        if not pivot:
            lineCount = (lineCount + 1) % 1000
            if lineCount == 0:
                # Flush the buffers if possible
                for ptime in sorted(data.keys()):
                    if ptime + (2 * interval) < ctime:
                        for dataline in data[ptime]:
                            sys.stdout.write("%s\t%s\t%s\n" % (
                                datetime.fromtimestamp(ptime).strftime('%Y-%m-%d %H:%M:%S'),
                                colNameSep.join(dataline),
                                data[ptime][dataline] * multiplier
                            ))
                        del data[ptime]
        else:
            uniqueCols.add(colVals)

    # And here we are at the end...
    if pivot:
        # Must create the BIG table now
        outline = ['time']
        for cols in uniqueCols:
            outline.append(colNameSep.join(cols))
        sys.stdout.write("\t".join(outline))
        sys.stdout.write("\n")

        for ptime in sorted(data.keys()):
            outline = [datetime.fromtimestamp(ptime).strftime('%Y-%m-%d %H:%M:%S')]
            for cols in uniqueCols:
                if cols in data[ptime]:
                    outline.append(str(data[ptime][cols] * multiplier))
                else:
                    outline.append('0')
            sys.stdout.write("\t".join(outline))
            sys.stdout.write("\n")
Example #42
	def __init__(self, fromCity=None, date=None, toCity=None, *args, **kwargs):
		super(SWAFareSpider, self).__init__(**kwargs)
		self.origin = fromCity
		self.outDate = dateParse(date)
		self.destination = toCity
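A hedged construction sketch for this spider; the city codes and date are illustrative, not from the original project:

spider = SWAFareSpider(fromCity='AUS', date='2015-05-05', toCity='DAL')
spider.outDate  # datetime.datetime(2015, 5, 5, 0, 0)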