def sync(obj: Base, *args, **kwargs):
    base_url = obj.sync_type.base_url
    res = Internet.html_get(base_url)
    # Post title headings, reversed so older posts are queued first
    h2_list = res.html.find(".post-card-title")[::-1]
    for h2 in h2_list:
        a = h2.find("a", first=True)
        link = urllib.parse.urljoin(base_url, a.attrs.get('href'))
        print(a.text.strip(), link)
        obj.add_text_task(unique_key=link, name=a.text.strip(), url=link, data={})
def sync(obj: Base, *args, **kwargs):
    soup = Internet.get_soup(obj.sync_type.base_url)
    ul = soup.find('ul', {'id': 'posts-container'})
    # Iterate oldest-first so newer posts end up last in the queue
    for li in ul.find_all('li')[::-1]:
        a = li.find('h3').find('a')
        data = {
            "text": a.get('href').strip(),
            "title": a.text.strip(),
        }
        obj.add_text_task(unique_key=data['text'], name=data['title'], url=data['text'], data=data)
def sync(obj: Base, *args, **kwargs):
    url = obj.sync_type.base_url
    article_url = obj.sync_type.extras.get('article_url')
    res = Internet.post_phjs(url, return_json=True)
    # Nodes arrive newest-first; reverse so older posts are queued first
    posts = res['result']['data']['nodes'][::-1]
    for post in posts:
        post['text'] = urllib.parse.urljoin(article_url, post.get('articleLink'))
        obj.add_text_task(unique_key=post.get('id'), name=post['title'], url=post['text'], data=post)
def sync(obj: Base, *args, **kwargs):
    tag = kwargs.get("tag", "programming")
    url = obj.sync_type.base_url.format(tag=tag)
    post_url = obj.sync_type.extras.get("post_url")
    soup = Internet.get_soup_phjs(url)
    posts = soup.find_all('div', {'class': 'title-wrapper'})[::-1]
    for post in posts:
        a = post.find('a')
        link = urllib.parse.urljoin(post_url, a.get('href').strip())
        obj.add_text_task(unique_key=link, name=a.text.strip(), url=link,
                          data=dict(title=a.text.strip(), text=link))
def sync(obj: Base, *args, **kwargs):
    price_trackers = obj.get_price_trackers()
    for pt in price_trackers:
        # print("syncing product:", pt.title)
        # Fetch the latest product details and persist the updated prices
        info = Tracker.get_poduct_info(pt.productUrl)
        obj.updated_price_tracker(info)
        content = (
            f"Title: {info.title}\n"
            f"Current Price: {info.curr_price}\n"
            f"High Price: {info.high_price}\n"
            f"Low Price: {info.low_price}\n"
            f"URL: {info.productUrl}\n"
        )
        obj.add_text_task(task_id=f"{obj.job.id}:{info.title}", data=dict(text=content))
def sync(obj: Base, *args, **kwargs):
    soup = Internet.get_soup_phjs(obj.sync_type.base_url)
    # Article anchors are matched by their generated CSS class names
    links = soup.find_all('a', {'class': 'o-eZTujG o-fyWCgU'})
    for a in links[::-1]:
        link = a.get('href')
        url = urllib.parse.urljoin(obj.sync_type.base_url, link)
        name = a.text.strip()
        obj.add_text_task(unique_key=url, name=name, url=url, data=dict(text=url))
def sync(obj: Base, *args, **kwargs):
    r = Internet.html_get(obj.sync_type.base_url)
    links = r.html.xpath(
        '/html/body/div[1]/div[2]/div/div[4]/div[2]/section/div[*]/div[*]/div[2]/a'
    )
    for a in links[::-1]:
        # Strip tracking query parameters from the article URL
        url = a.attrs.get('href').split("?")[0]
        name = a.text.strip()
        obj.add_text_task(unique_key=url, name=name, url=url, data=dict(text=url))
def sync(obj: Base, *args, **kwargs):
    client = NewsApiClient(api_key=os.environ.get('NEWS_API_KEY'))
    # Supported country codes: https://github.com/mattlisiv/newsapi-python/blob/master/newsapi/const.py
    country = kwargs.get("country", "in")
    top_headlines = client.get_top_headlines(language="en", country=country)
    articles = top_headlines.get('articles')[::-1]
    for article in articles:
        article['text'] = article['url']
        obj.add_text_task(unique_key=article.get("url"), name=article['title'],
                          url=article['url'], data=article)
def sync(obj: Base, *args, **kwargs):
    res = Internet.html_get(obj.sync_type.base_url)
    links = res.html.xpath(
        "/html/body/form/div[4]/div[3]/div/div[1]/div[*]/div/div[1]/h3/a")
    for a in links[::-1]:
        link = a.attrs.get('href')
        url = urllib.parse.urljoin(obj.sync_type.base_url, link)
        name = a.text.strip()
        obj.add_text_task(unique_key=url, name=name, url=url, data=dict(text=url))
def sync(obj: Base, *args, **kwargs):
    r = Internet.html_get(obj.sync_type.base_url)
    found_links = []
    # Evaluate every configured XPath and collect all matching anchors
    xpaths = obj.sync_type.extras.get("xp")
    for xpath in xpaths:
        links = r.html.xpath(xpath)
        if links:
            found_links.extend(links)
    for a in found_links[::-1]:
        url = a.attrs.get('href').split("?")[0]
        name = a.text.strip()
        obj.add_text_task(unique_key=url, name=name, url=url, data={})
def sync(obj: Base, *args, **kwargs):
    cat = kwargs.get("cat", "")
    url = urllib.parse.urljoin(obj.sync_type.base_url, cat)
    res = Internet.html_get(url)
    h2_list = res.html.find(".crayons-story__title")
    for h2 in h2_list[::-1]:
        a = h2.find('a', first=True)
        url = urllib.parse.urljoin(obj.sync_type.base_url, a.attrs.get('href'))
        obj.add_text_task(
            unique_key=a.attrs.get('id').strip(),
            name=a.text.strip(),
            url=url,
            data=dict(text=url),
        )
def sync(obj: Base, *args, **kwargs):
    r = Internet.html_get(obj.sync_type.base_url)
    links = []
    # Try each configured XPath until one matches
    xpaths = obj.sync_type.extras.get("xp")
    for xpath in xpaths:
        links = r.html.xpath(xpath)
        if links:
            break
    article_url = obj.sync_type.extras.get("article_url")
    for a in links[::-1]:
        path = a.attrs.get('href')
        url = urllib.parse.urljoin(article_url, path)
        name = a.text.strip()
        obj.add_text_task(unique_key=url, name=name, url=url, data={})
def sync(obj: Base, *args, **kwargs):
    group = kwargs.get("group") or os.environ.get("NG_DEFAULT_GROUP")
    typ = kwargs.get("type") or os.environ.get("NG_DEFAULT_TYPE")
    url = obj.sync_type.base_url.format(grp=group, typ=typ)
    res = Internet.post_phjs(url=url, return_json=True)
    posts = res.get("data").get("posts")[::-1]
    for post in posts:
        data = {
            "caption": "{}\n{}".format(post.get("title"), post.get("url")),
            "title": post.get("title"),
            "nsfw": post.get("nsfw"),
            "post_url": post.get("url"),
            "content_type": post.get("type"),
            "up_vote": post.get("upVoteCount"),
            "down_vote": post.get("downVoteCount"),
            "description": post.get("description"),
            "comments_count": post.get("commentsCount"),
        }
        # check post type and queue on the matching pipeline
        if post["type"] == "Photo":
            data["url"] = post.get("images").get("image700").get("url")
            obj.add_photo_task(unique_key=post.get("id"), name=post['title'], url=data["url"], data=data)
        elif post["type"] == "Animated":
            data["url"] = post.get("images").get("image460sv").get("url")
            obj.add_video_task(unique_key=post.get("id"), name=post['title'], url=data["url"], data=data)
def sync(obj: Base, *args, **kwargs):
    url = obj.sync_type.base_url
    soup = Internet.get_soup_phjs(url)
    divs = soup.find_all('div', {'class': 'entry-grid-content hgrid-span-12'})
    for div in divs[::-1]:
        h2 = div.find("h2", {"class": "entry-title"})
        a = h2.find('a')
        url = a.get('href')
        name = a.text.strip()
        # The summary block is optional; default to an empty description
        desc = ""
        desc_div = div.find("div", {"class": "entry-summary"})
        if desc_div:
            desc = desc_div.text.strip()
        obj.add_text_task(unique_key=url, name=name, url=url, data=dict(text=url, desc=desc))
class Sync(object):
    def __init__(self, sync_type: str, db: Session, request: Request, *args, **kwargs):
        self.sync_type = sync_type
        self.job_id = f"{sync_type}:{self.get_current_time()}:{uuid4()}"
        self.obj = Base(sync_type, self.job_id, db, request)
        self.args = args
        self.kwargs = kwargs

    def start(self):
        """Execute the sync method for this sync type."""
        # proceed only if the lock is not already acquired
        if not self.obj.sync_type.locked:
            try:
                self.obj.lock()
                # sync only if it is enabled
                if self.obj.sync_type.enabled:
                    SYNC_GRABBERS[self.sync_type](self.obj, *self.args, **self.kwargs)
                    self.obj.write_tasks()
            except Exception:
                self.run(self.obj.job_failed)
            else:
                self.run(self.obj.job_success)
                self.obj.notify()
            finally:
                self.obj.release()

    def run(self, func, *args, **kwargs):
        """Run the given function only if the sync type is enabled."""
        if self.obj.sync_type.enabled:
            func(*args, **kwargs)

    def get_current_time(self):
        # Convert the current UTC time to the timezone configured via TZ
        fmt = "%H.%M-%D"
        utcmoment_naive = datetime.utcnow()
        utcmoment = utcmoment_naive.replace(tzinfo=pytz.utc)
        tz = os.environ.get("TZ")
        conv_dt = utcmoment.astimezone(pytz.timezone(tz))
        return conv_dt.strftime(fmt)
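# Minimal usage sketch for Sync (a hedged example, not part of this module):
# it assumes a FastAPI app with a SQLAlchemy session dependency; `app`,
# `get_db`, and the route path below are hypothetical names.
#
#     @app.post("/sync/{sync_type}")
#     def trigger_sync(sync_type: str, request: Request, db: Session = Depends(get_db)):
#         # Create the runner, kick off the grabber, and report the job id
#         job = Sync(sync_type, db, request)
#         job.start()
#         return {"job_id": job.job_id}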
def sync(obj: Base, *args, **kwargs):
    # https://groww.in/slr/v1/search/derived/scheme?available_for_investment=true&doc_type=scheme&page=0&plan_type=Direct&q=&size=16&sort_by=3
    # sort_by 1: Rating high to low
    # sort_by 2: Rating low to high
    # sort_by 3: Popularity
    res = Internet.post_phjs(url=obj.sync_type.base_url, return_json=True)['content']
    for post in res.get("data").get("posts"):
        data = {
            "caption": "{}\n{}".format(post.get("title"), post.get("url")),
            "title": post.get("title"),
            "nsfw": post.get("nsfw"),
            "post_url": post.get("url"),
            "content_type": post.get("type"),
            "up_vote": post.get("upVoteCount"),
            "down_vote": post.get("downVoteCount"),
            "description": post.get("description"),
            "comments_count": post.get("commentsCount"),
        }
        # check post type and queue on the matching pipeline
        if post["type"] == "Photo":
            data["url"] = post.get("images").get("image700").get("url")
            obj.add_photo_task(unique_key=post.get("id"), name=post['title'], url=post.get("url"), data=data)
        elif post["type"] == "Animated":
            data["url"] = post.get("images").get("image460sv").get("url")
            obj.add_video_task(unique_key=post.get("id"), name=post['title'], url=post.get("url"), data=data)
def sync(obj: Base, *args, **kwargs):
    cat = kwargs.get("cat", "")
    url = obj.sync_type.base_url.format(cat=cat)
    res = Internet.html_get(url)
    # The listing markup varies; try each known XPath until one matches
    xpaths = [
        "/html/body/main/div[2]/div/div/div[1]/div/div[2]/div/article[*]/div/div[2]/a",
        "/html/body/main/div[2]/div/div/div[1]/div/article[*]/div/div[2]/a",
    ]
    links = []
    for xpath in xpaths:
        links = res.html.xpath(xpath)
        if links:
            break
    f_url = obj.sync_type.extras.get("base_url")
    for a in links[::-1]:
        link = urljoin(f_url, a.attrs.get("href"))
        name = a.text.strip().replace("\n", "--")
        obj.add_text_task(unique_key=link, name=name, url=link, data={})
def sync(obj: Base, *args, **kwargs):
    category = kwargs.get("category", "Startup")
    url = obj.sync_type.base_url.format(category=category)
    soup = Internet.get_soup_phjs(url)
    # list items are returned in duplicate, so skip entries with an empty title
    lis = soup.find_all('li', {'class': 'sc-hMFtBS gpleaq'})[::-1]
    for li in lis:
        a = li.find('a')
        div = li.find('div', {'class': 'sc-gqPbQI iIXuvz'})
        title_a = div.find("a")
        name = title_a.text.strip()
        if name == "":
            continue
        link = urllib.parse.urljoin(obj.sync_type.base_url, a.get('href').strip())
        obj.add_text_task(unique_key=link, name=name, url=link, data=dict(text=link))