def __init__(self, url='', username='', userTitle='', id='', name='', avatar='', link='', score=0.0, feedbackOverall=0, feedbackSummaryList=None, feedbackRatings=None, reviews=None, lastUpdate=0):
    """Seller/user profile record scraped from a marketplace page.

    BUGFIX: the list parameters previously used mutable defaults ([]),
    so every instance built without them shared the same list objects.
    They now default to None and each instance gets its own fresh list;
    callers that passed explicit lists are unaffected.
    """
    # Normalize the profile URL immediately so later equality checks match.
    self.url = LinksHelper.fix_url(url)
    self.username = username
    self.userTitle = userTitle
    self.id = id  # caller-facing name kept, even though it shadows builtin id
    self.name = name
    self.avatar = avatar
    self.link = link
    self.score = score
    self.feedbackOverall = feedbackOverall
    self.feedbackSummaryList = [] if feedbackSummaryList is None else feedbackSummaryList
    self.feedbackRatings = [] if feedbackRatings is None else feedbackRatings
    self.reviews = [] if reviews is None else reviews
    self.lastUpdate = lastUpdate
def __init__(self, url, type, originalId, id, author, categories, title, description, images, timeLeft, price, details, date, ratings, shipping, reviews, lastUpdate):
    """Product/listing record; the page URL is normalized on construction."""
    self.url = LinksHelper.fix_url(url)
    self.type = type
    self.originalId = originalId
    self.id = id
    self.author = author
    self.categories = categories
    self.title = title
    self.description = description
    self.images = images
    self.timeLeft = timeLeft
    self.price = price
    self.details = details
    self.date = date
    self.ratings = ratings
    self.shipping = shipping
    self.reviews = reviews
    self.lastUpdate = lastUpdate
def __init__(self, url, authorUsername, authorFullName, date, score, title, body, purchased, thumbsUp, thumbsDown, lastUpdate=''):
    """Single product review; the source URL is normalized on construction."""
    self.url = LinksHelper.fix_url(url)
    self.authorUsername = authorUsername
    self.authorFullName = authorFullName
    self.date = date
    self.score = score
    self.title = title
    self.body = body
    self.purchased = purchased
    self.thumbsUp = thumbsUp
    self.thumbsDown = thumbsDown
    self.lastUpdate = lastUpdate
def findLJSONObjectAlready(website, url='', title='', description='', allowTitleIncluded=False):
    """Look up a previously scraped JSON object for `website`.

    Matches by normalized URL, exact title, exact description, or — when
    `allowTitleIncluded` is set — by title substring in either direction.
    Returns the matching object, or None when the website's cache cannot
    be loaded or nothing matches.

    BUGFIXES:
    - removed the per-object debug print, which crashed with
      AttributeError on cached objects lacking a `title` attribute;
    - the `allowTitleIncluded` branch previously matched EVERY object when
      `title` was '' (the empty string is a substring of anything) and
      dereferenced `object.title` without a hasattr guard.
    """
    url = LinksHelper.fix_url(url)
    global arrJSONObjects
    # Lazily load the per-website cache from disk on first access.
    if website not in arrJSONObjects:
        if JSONDB.readJSONObjectsFiles(website) == False:
            print("IT DOESNT WORK....")
            return None
    cached = arrJSONObjects[website]
    if cached is None:
        return None
    for obj in cached:
        if (hasattr(obj, 'url') and url == obj.url) or \
           (title != '' and hasattr(obj, 'title') and title == obj.title) or \
           (description != '' and hasattr(obj, 'description') and description == obj.description):
            return obj
        if allowTitleIncluded and title != '' and hasattr(obj, 'title') and \
                (title in obj.title or obj.title in title):
            return obj
    return None
def __init__(self, url, type, id, title, parent):
    """Lightweight category/forum node; URL is normalized on construction."""
    self.url = LinksHelper.fix_url(url)
    self.type = type
    self.id = id
    self.title = title
    self.parent = parent
def parse(self, response):
    # Scrapy entry point: parse one page, harvest its <a href> links into
    # self.linksQueue, then yield follow-up Requests for queued links.
    url = response.url
    # Test mode: when testingURL is set, re-fetch that URL directly (outside
    # scrapy) and parse it instead of the scheduled response.
    if self.testingURL != '':
        response = LinksHelper.getRequestTrials(self.session, self.testingURL, {}, {}, maxTrials=5)
        html = response.text
        response = Selector(text=html)
        url = self.testingURL
    self.parseResponse(response, url)
    print(url, "data", response, self.ONLY_ONE_PAGE)
    if self.ONLY_ONE_PAGE == False:
        for next_page in response.css('a'):
            next_page = self.extractFirstElement(next_page.xpath('@href'))
            # Strip any fragment ("#...") so the same page isn't queued twice.
            sharpIndex = next_page.find('#')
            if sharpIndex >= 0:
                next_page = next_page[0:sharpIndex]
            parsed_url = urlparse(next_page)
            # Relative link: prefix with the crawl root (scheme missing).
            if bool(parsed_url.scheme) == False:
                newUrl = self.url.rstrip('/')
                newUrl = newUrl.rstrip('/')  # second rstrip is redundant — kept as-is
                next_page = newUrl + next_page
            # print(next_page)
            # Respect the page cap; 0 means unlimited.
            if self.MAXIMUM_NUMBER_PAGES == 0 or len(self.linksQueue) < self.MAXIMUM_NUMBER_PAGES:
                self.addLink(next_page)
        # NOTE(review): reconstructed from a collapsed one-line source; the
        # try/while drain is assumed to sit inside the ONLY_ONE_PAGE guard —
        # confirm against the original formatting.
        try:
            # Drain the queue; linksQueueIndex persists across parse() calls
            # so already-yielded links are not re-requested.
            while self.linksQueueIndex < len(self.linksQueue):
                self.linksQueueIndex += 1
                # print("self.linksQueue", len(self.linksQueue) )
                # print("self.linksQueue[self.linksQueueIndex]", self.linksQueue[self.linksQueueIndex])
                yield scrapy.Request(url=self.linksQueue[self.linksQueueIndex - 1], callback=self.parse)
        except ValueError:
            pass
def getAddress(city, country):
    """Geocode "city country" through the Google Maps geocode API.

    Returns the first result's location dict ({'lat': ..., 'lng': ...}),
    or None when the API returns no results.
    """
    query = (city + ' ' + country).replace(' ', '+')
    session = requests.Session()
    response = LinksHelper.getRequestTrials(
        session,
        'https://maps.google.com/maps/api/geocode/json?address=' + query + '&sensor=false',
        {}, {}, maxTrials=5)
    payload = response.json()
    if len(payload['results']) > 0:
        return payload['results'][0]['geometry']['location']
    return None
def postAddForum(rootURL, url, user, parentId, name, title, description, iconPic, coverPic, arrKeywords=None, dtOriginalDate=None, country='', city='', language='', latitude=-666, longitude=-666):
    """Create a forum on the remote server from scraped data.

    Returns the new forum's id on success, None when the server rejects the
    request, and False when the user cannot be logged in.

    BUGFIX: `arrKeywords` previously used a mutable default ([]); it now
    defaults to None and is normalized locally — callers are unaffected.
    """
    arrKeywords = [] if arrKeywords is None else arrKeywords
    user = ServerAPI.loginUser(user)
    if user is None:
        return False
    # Sanitize scraped strings and rewrite relative links against the source site.
    title = LinksHelper.fixArchiveStrings(title)
    description = LinksHelper.fixArchiveStrings(description)
    iconPic = LinksHelper.fixArchiveStrings(iconPic)
    coverPic = LinksHelper.fixArchiveStrings(coverPic)
    description = LinksHelper.fix_relative_urls(description, rootURL)
    rez = ServerAPI.processLocation(country, city, language, latitude, longitude)
    latitude = rez[0]
    longitude = rez[1]
    arrAdditionalInfo = {
        'scraped': True,
        'source': {
            'page': url,
            'website': rootURL,
        }
    }
    if dtOriginalDate is not None:
        arrAdditionalInfo['dtOriginal'] = dtOriginalDate
    if isinstance(arrKeywords, str):
        keywords = arrKeywords
    else:
        keywords = ','.join(str(e) for e in arrKeywords)
    data = {
        'id': user['id'],
        'sessionId': user['sessionId'],
        'parent': parentId,
        'title': title,
        'name': name,
        'description': description,
        'iconPic': iconPic,
        'coverPic': coverPic,
        'keywords': keywords,
        'country': country,
        'city': city,
        'language': language,
        'latitude': latitude,
        'longitude': longitude,
        'additionalInfo': ujson.dumps(arrAdditionalInfo)
    }
    headers = {}
    # NOTE(review): `session` is not defined in this function — presumably a
    # module-level requests.Session; confirm it exists at module scope.
    result = LinksHelper.getRequestTrials(session, SERVER_URL + "forums/add-forum", data, headers, maxTrials=5)
    result = result.json()
    if result['result'] == True:
        print('FORUM new ', result['forum']['URL'])
        return result['forum']['id']
    else:
        print("ERROR adding new forum ", result)
        return None
def postAddReply(rootURL, user, parentId, parentReplyId, title, description, arrKeywords=None, arrAttachments=None, dtOriginalDate=None, country='', city='', language='', latitude=-666, longitude=-666, authorName='', authorAvatar=''):
    """Post a reply under topic `parentId` (optionally nested under
    `parentReplyId`) on the remote server.

    Returns the new reply's id on success, None when the server rejects the
    request, and False when login fails or `parentId` is None.

    BUGFIX: `arrKeywords` and `arrAttachments` previously used mutable
    defaults ([]); they now default to None and are normalized locally —
    callers are unaffected.
    """
    arrKeywords = [] if arrKeywords is None else arrKeywords
    arrAttachments = [] if arrAttachments is None else arrAttachments
    user = ServerAPI.loginUser(user)
    if user is None:
        return False
    if parentId is None:
        return False
    if parentReplyId is None:
        parentReplyId = ""
    # Sanitize scraped strings and rewrite relative links against the source site.
    title = LinksHelper.fixArchiveStrings(title)
    description = LinksHelper.fixArchiveStrings(description)
    authorAvatar = LinksHelper.fixArchiveStrings(authorAvatar)
    authorName = LinksHelper.fixArchiveStrings(authorName)
    description = LinksHelper.fix_relative_urls(description, rootURL)
    rez = ServerAPI.processLocation(country, city, language, latitude, longitude)
    latitude = rez[0]
    longitude = rez[1]
    arrAdditionalInfo = {
        'scraped': True,
    }
    if dtOriginalDate is not None:
        arrAdditionalInfo['dtOriginal'] = dtOriginalDate
    if authorName != '':
        arrAdditionalInfo['orgName'] = authorName
    if authorAvatar != '':
        arrAdditionalInfo['orgAvatar'] = authorAvatar
    if isinstance(arrKeywords, str):
        keywords = arrKeywords
    else:
        keywords = ','.join(str(e) for e in arrKeywords)
    data = {
        'id': user['id'],
        'sessionId': user['sessionId'],
        'parent': parentId,
        'parentReply': parentReplyId,
        'title': title,
        'description': description,
        'keywords': keywords,
        'attachments': ujson.dumps(arrAttachments),
        'country': country,
        'city': city,
        'language': language,
        'latitude': latitude,
        'longitude': longitude,
        'additionalInfo': ujson.dumps(arrAdditionalInfo)
    }
    headers = {}
    # NOTE(review): `session` is not defined in this function — presumably a
    # module-level requests.Session; confirm it exists at module scope.
    result = LinksHelper.getRequestTrials(session, SERVER_URL + "replies/add-reply", data, headers, maxTrials=5)
    result = result.json()
    if result['result'] == True:
        print('reply new ', result['reply']['URL'])
        return result['reply']['id']
    else:
        print("ERROR adding new reply ", result)
        return None
def addLinkVisited(website, url):
    """Record `url` as visited for `website`.

    Returns False when the URL was already recorded; otherwise appends it
    to the in-memory cache and returns None (implicit), matching the
    original contract.
    """
    url = LinksHelper.fix_url(url)
    if LinksDB.checkLinkVisitedAlready(website, url):
        return False
    global arrLinksVisited
    # First touch for this website: try loading the on-disk cache, else start empty.
    if website not in arrLinksVisited:
        if LinksDB.readLinksVisitedFiles(website) == False:
            arrLinksVisited[website] = []
    arrLinksVisited[website].append(url)
def checkLinkVisitedAlready(website, url):
    """Return True when `url` is already in `website`'s visited cache."""
    url = LinksHelper.fix_url(url)
    global arrLinksVisited
    # Load the cache from disk on first access; a failed load means "not visited".
    if website not in arrLinksVisited:
        if LinksDB.readLinksVisitedFiles(website) == False:
            return False
    visited = arrLinksVisited[website]
    return visited is not None and url in visited
def processURL(self, initialURL, timestamp, endtimestamp):
    """Fetch the Wayback Machine snapshot of `initialURL` at `endtimestamp`
    and hand the parsed page to the crawler, stamping it with `timestamp`
    reformatted as "YYYY-MM-DD hh:mm:ss".
    """
    # Skip URLs the crawler has blacklisted by substring.
    for rejection in self.crawler.rejectionSubstr:
        if rejection in initialURL:
            return None
    archiveURL = "http://web.archive.org/web/" + endtimestamp + "/" + initialURL
    response = LinksHelper.getRequestTrials(self.session, archiveURL, {}, {}, maxTrials=5)
    sel = Selector(text=response.text)
    # "YYYYMMDDhhmmss" -> "YYYY-MM-DD hh:mm:ss" (any trailing characters kept).
    ts = timestamp
    date = ts[:4] + '-' + ts[4:6] + '-' + ts[6:8] + ' ' + ts[8:10] + ':' + ts[10:12] + ':' + ts[12:]
    self.crawler.date = date
    self.crawler.parseResponse(sel, initialURL)
def findLinkObjectAlready(website, url='', title='', description='', allowTitleIncluded=False, similarity=False):
    """Look up a previously scraped link object for `website`.

    Matches by normalized URL, exact title, exact description, fuzzy
    title/description (SequenceMatcher ratio >= 0.7) when `similarity` is
    set, or title substring in either direction when `allowTitleIncluded`
    is set. Returns the matching object, or None when the cache cannot be
    loaded or nothing matches.

    BUGFIX: the `allowTitleIncluded` branch previously matched EVERY object
    when `title` was '' (the empty string is a substring of anything) and
    dereferenced `object.title` without a hasattr guard.
    """
    url = LinksHelper.fix_url(url)
    global arrLinksObjects
    # Lazily load the per-website cache from disk on first access.
    if website not in arrLinksObjects:
        if LinksDB.readLinkObjectsFiles(website) == False:
            print("IT DOESNT WORK....")
            return None
    cached = arrLinksObjects[website]
    if cached is None:
        return None
    for obj in cached:
        if (hasattr(obj, 'url') and url == obj.url) or \
           (title != '' and hasattr(obj, 'title') and title == obj.title) or \
           (description != '' and hasattr(obj, 'description') and description == obj.description):
            return obj
        if similarity:
            if title != '' and hasattr(obj, 'title'):
                if SequenceMatcher(None, title, obj.title).ratio() >= 0.7:
                    return obj
            if description != '' and hasattr(obj, 'description'):
                if SequenceMatcher(None, description, obj.description).ratio() >= 0.7:
                    return obj
        if allowTitleIncluded and title != '' and hasattr(obj, 'title') and \
                (title in obj.title or obj.title in title):
            return obj
    return None
def postAddProduct(rootURL, url, user, parentId, title, description, shortDescription='', arrKeywords=None, arrAttachments=None, dtOriginalDate=None, country='', city='', language='', latitude=-666, longitude=-666, itemId='', author=None, timeLeft=0, details=None, price=None, ratingScoresList=None, shipping=None, reviewsList=None, lastUpdate=''):
    """Create a product topic on the remote server from scraped data.

    Returns the new topic's id on success, None when the server rejects the
    request, and False when the user cannot be logged in.

    BUGFIXES:
    - `author` defaults to None but `author.avatar` / `author.username` were
      dereferenced unconditionally, so calling with the default crashed;
      the same applied to the `.getJSON()` calls on `details`, `price`,
      `ratingScoresList`, `shipping` and `reviewsList`. All are now
      None-guarded (serialized as JSON null when absent).
    - `arrKeywords` and `arrAttachments` previously used mutable defaults
      ([]); they now default to None and are normalized locally.
    """
    arrKeywords = [] if arrKeywords is None else arrKeywords
    arrAttachments = [] if arrAttachments is None else arrAttachments
    user = ServerAPI.loginUser(user)
    if user is None:
        return False
    # Sanitize scraped strings and rewrite relative links against the source site.
    title = LinksHelper.fixArchiveStrings(title)
    description = LinksHelper.fixArchiveStrings(description)
    shortDescription = LinksHelper.fixArchiveStrings(shortDescription)
    if author is not None:
        authorAvatar = LinksHelper.fixArchiveStrings(author.avatar)
        authorName = LinksHelper.fixArchiveStrings(author.username)
    else:
        authorAvatar = ''
        authorName = ''
    description = LinksHelper.fix_relative_urls(description, rootURL)
    rez = ServerAPI.processLocation(country, city, language, latitude, longitude)
    latitude = rez[0]
    longitude = rez[1]
    arrAdditionalInfo = {
        'scraped': True,
        'source': {
            'page': url,
            'website': rootURL,
        },
        'itemId': itemId,
        'timeLeft': timeLeft,
    }
    if dtOriginalDate is not None:
        arrAdditionalInfo['dtOriginal'] = dtOriginalDate
    if authorName != '':
        arrAdditionalInfo['orgName'] = authorName
    if authorAvatar != '':
        arrAdditionalInfo['orgAvatar'] = authorAvatar
    if isinstance(arrKeywords, str):
        keywords = arrKeywords
    else:
        keywords = ','.join(str(e) for e in arrKeywords)

    def _dump(obj):
        # Serialize an optional project object; JSON null when absent.
        return ujson.dumps(obj.getJSON() if obj is not None else None)

    data = {
        'id': user['id'],
        'sessionId': user['sessionId'],
        'parent': parentId,
        'title': title,
        'description': description,
        'shortDescription': shortDescription,
        'keywords': keywords,
        'attachments': ujson.dumps(arrAttachments),
        'country': country,
        'city': city,
        'language': language,
        'latitude': latitude,
        'longitude': longitude,
        'additionalInfo': ujson.dumps(arrAdditionalInfo),
        # additional product information
        'author': _dump(author),
        'details': _dump(details),
        'price': _dump(price),
        'ratingScoresList': _dump(ratingScoresList),
        'shipping': _dump(shipping),
        'reviewsList': _dump(reviewsList),
        'lastUpdate': lastUpdate
    }
    headers = {}
    # NOTE(review): `session` is not defined in this function — presumably a
    # module-level requests.Session; confirm it exists at module scope.
    result = LinksHelper.getRequestTrials(session, SERVER_URL + "topics/add-topic", data, headers, maxTrials=5)
    result = result.json()
    if result['result'] == True:
        print('topic new ', result['topic']['URL'])
        return result['topic']['id']
    else:
        print("ERROR adding new topic ", result)
        return None