class MyLinkedInMessage(QThread): notifyLinkedIn = pyqtSignal(object) def __init__(self, spider, memberList, subject, message): QThread.__init__(self) # self.spider = Spider() self.spider = spider self.regex = Regex() self.memberList = memberList self.subject = unicode(subject) self.message = unicode(message) def run(self): self.sendMessage() self.notifyLinkedIn.emit( '<font color=red><b>Finish Sending All Messages.</b></font>') def sendMessage(self): print self.memberList for member in self.memberList: messageUrl = 'http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=' + str( member[1]) print messageUrl # messageUrl = 'http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=' + '65471931' # data = self.spider.fetchData('http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=65471931') data = self.spider.fetchData(messageUrl) data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) fromName = self.regex.getSearchedData( '(?i)<input type="hidden" name="fromName" value="([^"]*)"', data) fromEmail = self.regex.getSearchedData( '(?i)<input type="hidden" name="fromEmail" value="([^"]*)"', data) # connectionIds = self.regex.getSearchedData('(?i)<input type="hidden" name="connectionIds" value="([^"]*)"', data) csrfToken = self.regex.getSearchedData( '(?i)<input type="hidden" name="csrfToken" value="([^"]*)"', data) sourceAlias = self.regex.getSearchedData( '(?i)<input type="hidden" name="sourceAlias" value="([^"]*)"', data) linkedInSubject = u'Hi ' + unicode( member[0]).split(' ')[0] + self.subject linkedInMessage = u'Hi ' + unicode( member[0]).split(' ')[0] + u',\n' + self.message print linkedInMessage params = { 'addMoreRcpts': 'false', 'ajaxSubmit': 'Send Message', 'allowEditRcpts': 'true', 'body': linkedInMessage, 'connectionIds': str(member[1]), 'connectionNames': '', 'csrfToken': csrfToken, 'fromEmail': fromEmail, 'fromName': fromName, 'itemID': '', 'openSocialAppBodySuffix': '', 'showRecipeints': 
'showRecipeints', 'sourceAlias': sourceAlias, 'st': '', 'subject': linkedInSubject, 'submit': 'Send Message', 'viewerDestinationUrl': '' } print params msgUrl = 'http://www.linkedin.com/msgToConns?displayCreate=' data = self.spider.fetchData(msgUrl, params) data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) if self.regex.isFoundPattern('(?i)<div class="alert success">', data): print 'Message Sent.' self.notifyLinkedIn.emit( '<font color=green><b>Successfully Sent Message To: %s</b></font>' % member[0]) else: self.notifyLinkedIn.emit( '<font color=red><b>Something Wrong during Send Message To</b></font>' % member[0]) # params = {'addMoreRcpts': 'false', # 'ajaxSubmit': 'Send Message', # 'allowEditRcpts': 'true', # 'body': 'Script Test', # 'connectionIds': '65471931', # 'connectionNames': '', # 'csrfToken': 'ajax: 6539671039643459056', # 'fromEmail': '467728216', # 'fromName': 'Mehedi Hasan', # 'itemID': '', # 'openSocialAppBodySuffix': '', # 'showRecipeints': 'showRecipeints', # 'sourceAlias': '0_6k2algZhQ6vbvlhlVSByxRKi0OB9NXjxrnJYWBFvfhn', # 'st': '', # 'subject': 'Script Test', # 'submit': 'Send Message', # 'viewerDestinationUrl': ''} #<input type="hidden" name="fromName" value="Mehedi Hasan" id="fromName-msgForm"> # <input type="hidden" name="showRecipeints" value="showRecipeints" id="showRecipeints-msgForm"> # <input type="hidden" name="fromEmail" value="467728216" id="fromEmail-msgForm"> # <input type="hidden" name="connectionIds" value="65471931" id="connectionIds-msgForm"> # <input type="hidden" name="connectionNames" value="" id="connectionNames-msgForm"> # <input type="hidden" name="allowEditRcpts" value="true" id="allowEditRcpts-msgForm"> # <input type="hidden" name="addMoreRcpts" value="false" id="addMoreRcpts-msgForm"> # <input type="hidden" name="itemID" value="" id="itemID-msgForm"> # <input type="hidden" name="openSocialAppBodySuffix" value="" id="openSocialAppBodySuffix-msgForm"> # <input type="hidden" name="st" value="" 
id="st-msgForm"> # <input type="hidden" name="viewerDestinationUrl" value="" id="viewerDestinationUrl-msgForm"> # <input type="hidden" name="csrfToken" value="ajax:6539671039643459056" id="csrfToken-msgForm"> # <input type="hidden" name="sourceAlias" value="0_6k2algZhQ6vbvlhlVSByxRKi0OB9NXjxrnJYWBFvfhn" id="sourceAlias-msgForm"> """ msgUrl1 = 'http://www.linkedin.com/msgToConns?displayCreate=' msgParams = {} addMoreRcpts false ajaxSubmit Send Message allowEditRcpts true body fdgdfgdfgdfg dg d connectionIds 57414219 connectionNames csrfToken ajax:3480949306085123249 fromEmail 467728216 fromName Mehedi Hasan goback .con.npv_57414219_*1_*1_name_r5tN_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1 itemID openSocialAppBodySuffix showRecipeints showRecipeints sourceAlias 0_6k2algZhQ6vbvlhlVSByxRKi0OB9NXjxrnJYWBFvfhn st subject viewerDestinationUrl """ """addMoreRcpts false
class Scrapper(QThread):
    """Worker thread that scrapes eBay item pages from a URL list and writes
    one CSV row per item to scrapper.csv."""

    # Emits HTML status strings for the GUI log.
    notifyScrapper = pyqtSignal(object)
    # Cooperative stop flag checked before the scrape loop starts.
    isFinished = False

    def __init__(self, urllist):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        print urllist
        self.urllist = urllist
        self.csv = Csv('scrapper.csv')

    def run(self):
        self.scrapData()
        self.notifyScrapper.emit(
            '<font color=green><b>------------------ Finish! ------------------------- </b></font>')

    def scrapData(self):
        """Fetch each URL, parse the eBay item page with BeautifulSoup and
        append [URL, Title, Price, Brand, ...] (same order as csvHeader) to
        the CSV file."""
        try:
            total = 0
            csvHeader = ['URL', 'Title', 'Price', 'Brand', 'Features', 'Material', 'Measurements',
                         'Category', 'Size', 'Color', 'Design']
            self.csv.writeCsvRow(csvHeader)
            if self.isFinished: return
            for url in self.urllist:
                if len(url) > 0:
                    # Strip line endings, then keep only the http... part.
                    url = self.regex.replaceData('(?i)\r', '', url)
                    url = self.regex.replaceData('(?i)\n', '', url)
                    url = self.regex.getSearchedData('(?i)(http.*?)$', url)
                    print 'URL: ', url
                    self.notifyScrapper.emit(('<font color=green><b>URL: %s</b></font>' % url))
                    data = self.spider.fetchData(url)
                    if data and len(data) > 0:
                        data = self.regex.reduceNewLine(data)
                        data = self.regex.reduceBlankSpace(data)
                        soup = BeautifulSoup(data)
                        soup.prettify()
                        title = ''
                        price = ''
                        size = ''
                        brand = ''
                        features = ''
                        material = ''
                        measurements = ''
                        category = ''
                        color = ''
                        design = ''
                        if soup.find('span', id='vi-lkhdr-itmTitl') is not None:
                            title = soup.find('span', id='vi-lkhdr-itmTitl').text
                        if soup.find('span', id='prcIsum'):
                            price = soup.find('span', id='prcIsum').text
                        if soup.find('div', class_='itemAttr'):
                            # The item-specifics table lays out label/value
                            # pairs in alternating <td> cells, hence step 2.
                            specchunk = soup.find('div', class_='itemAttr')
                            rows = specchunk.find_all('tr')
                            for row in rows:
                                cols = row.find_all('td')
                                for i in range(0, len(cols), 2):
                                    # if self.regex.isFoundPattern('(?i)Condition:', cols[i].text.strip()):
                                    #     conditionChunk = cols[i + 1]
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<span class="infoLink u-nowrap" id="readFull">.*?</span>', '', unicode(conditionChunk))
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<b class="g-hdn">.*?</b>', '', conditionChunk)
                                    #     condition = BeautifulSoup(conditionChunk).text
                                    #     print condition
                                    if self.regex.isFoundPattern('(?i)Brand:', cols[i].text.strip()):
                                        brand = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Features:', cols[i].text.strip()):
                                        features = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Material:', cols[i].text.strip()):
                                        material = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Measurements:', cols[i].text.strip()):
                                        measurements = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Category:', cols[i].text.strip()):
                                        category = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Color:', cols[i].text.strip()):
                                        color = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Design:', cols[i].text.strip()):
                                        design = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Size:', cols[i].text.strip()):
                                        size = cols[i + 1].text
                        self.notifyScrapper.emit('<font color=black><b>Writting data to csv file.</b></font>')
                        # Order must match csvHeader above.
                        csvData = [url, title, price, brand, features, material, measurements, category, size,
                                   color, design]
                        self.notifyScrapper.emit('<font color=black><b>Data: %s</b></font>' % unicode(csvData))
                        self.csv.writeCsvRow(csvData)
                        self.notifyScrapper.emit(
                            '<font color=black><b>Successfully Written data to csv file.</b></font>')
                        total += 1
            self.notifyScrapper.emit('<font color=green><b>Total Data scrapped: [%s]</b></font>' % str(total))
        except Exception, x:
            self.notifyScrapper.emit('<font color=red><b>Error scrapping category: %s</b></font>' % x.message)
            self.logger.error(x.message)
            print x
class MyLinkedInMembers(QThread): notifyLinkedIn = pyqtSignal(object) notifyMembers = pyqtSignal(object) cookieL = pyqtSignal(object) def __init__(self, spider, url, pageRange=None): QThread.__init__(self) # self.spider = Spider() self.spider = spider self.regex = Regex() self.url = url self.startPage = None self.endPage = None if self.regex.isFoundPattern('(?i)(\d+)-(\d+)', str(pageRange).strip()): pageRangeFormat = self.regex.getSearchedDataGroups('(?i)(\d+)-(\d+)', str(pageRange).strip()) self.startPage = int(pageRangeFormat.group(1)) self.endPage = int(pageRangeFormat.group(2)) elif self.regex.isFoundPattern('(?i)(\d+)', str(pageRange).strip()): pageRangeFormat = self.regex.getSearchedDataGroups('(?i)(\d+)', str(pageRange).strip()) self.startPage = int(pageRangeFormat.group(1)) self.endPage = self.startPage def run(self): self.getMembers(self.url) self.notifyLinkedIn.emit('<font color=red><b>Finish scraping members.<b></font>') def getMembers(self, url, pageNumber=0): print 'Members URL: ' + url self.notifyLinkedIn.emit('<font color=green><b>Start Scraping All Members.<b></font>') self.notifyLinkedIn.emit('<b>Wait For 15 seconds Break...<b>') time.sleep(15) self.notifyLinkedIn.emit('<b>15 seconds Break Finish.<b>') groupData = self.spider.fetchData(str(url).replace('&', '&')) groupData = self.regex.reduceNewLine(groupData) groupData = self.regex.reduceBlankSpace(groupData) print groupData print 'page number: ' + str(pageNumber) if pageNumber > 0: harvestedMembers = [] allMembers = self.regex.getAllSearchedData('(?i)<li class="member" id="member-[^"]*"[^>]*?>(.*?)</div>', groupData) for members in allMembers: memberId = self.regex.getSearchedData('(?i)data-li-memberId="([^"]*)"', members) memberName = self.regex.getSearchedData('(?i)data-li-fullName="([^"]*)"', members) memberTitle = self.regex.getSearchedData('(?i)<p class="headline">([^<]*?)</p>', members) memberTitle = self.regex.replaceData('(?i)&', '&', memberTitle) harvestedMembers.append((memberId, 
memberName, memberTitle)) self.notifyLinkedIn.emit('<b>Member ID: </b>%s <b>Member Name: </b>%s' % (memberId, memberName + ' (' + memberTitle + ')')) # members = self.regex.getAllSearchedData( # '(?i)class="send-message" data-li-memberId="([^"]*)" data-li-fullName="([^"]*)"', groupData) # print members self.notifyMembers.emit(harvestedMembers) # for member in members: # print member # self.notifyLinkedIn.emit('<b>Member Name: </b>%s <b>Member ID: </b>%s' % (member[1], member[0])) urlNext = self.regex.getSearchedData('(?i)<a href="([^"]*)"[^>]*?>\s*?<strong>\s*?next', groupData) if urlNext and len(urlNext) > 0: # nextP = int(self.regex.getSearchedData('(?i).*?(\d+)$', urlNext.strip())) urlNext = self.regex.replaceData('(?i)&', '&', urlNext) urlNext = self.regex.replaceData('(?i)split_page=\d+', 'split_page=', urlNext) pageNumber += 1 if self.startPage <= pageNumber <= self.endPage: self.notifyLinkedIn.emit('<b>Wait for 15 second break...</b>') time.sleep(15) print 'sleep 15 s' self.notifyLinkedIn.emit('<b>15 second break finish!!!</b>') self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber) elif pageNumber < self.startPage: pageNumber = self.startPage self.notifyLinkedIn.emit('<b>Wait for 15 second break...</b>') time.sleep(15) print 'page number less 0 sleep' self.notifyLinkedIn.emit('<b>15 second break finish!!!</b>') self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber) if self.startPage is None and self.endPage is None: pageNumber += 1 self.notifyLinkedIn.emit('<b>Wait for 15 second break...</b>') time.sleep(15) print 'page number less 0 sleep' self.notifyLinkedIn.emit('<b>15 second break finish!!!</b>') self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber)
class YoutubeScrapper(object): def __init__(self): self.logger = LogManager(__name__) self.spider = Spider() self.regex = Regex() self.utils = Utils() def scrapVideoDownloadUrl(self, url, filename=None): data = self.spider.fetchData(url) if data and len(data) > 0: title = self.scrapTitle(url) data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data) dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk) dlUrlParts = dlUrlChunk.split(',') sig = '' video = '' videoUrl = '' print dlUrlParts for dlUrlPart in dlUrlParts: dlUrlPart = urllib2.unquote(dlUrlPart) print dlUrlPart ## TODO if self.regex.isFoundPattern('(?i)itag=22', dlUrlPart) or self.regex.isFoundPattern('(?i)itag=18', dlUrlPart): urlPart = dlUrlPart.split(' ') for part in urlPart: print part if self.regex.isFoundPattern('(?i)sig=.*?', part): sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part) if self.regex.isFoundPattern('(?i)url=.*?', part): video = self.regex.getSearchedData('(?i)url=(.*?)$', part) print video videoUrl = video + '&signature=' + sig self.downloadDir = './natok.mp4' print 'Video URL= ' + videoUrl print self.downloadDir break # dlPath = './natok.mp4' if filename is None else filename fname = self.regex.replaceData('\s+', '_', title) dlPath = './' + fname + '.mp4' if filename is None else filename print dlPath print '\n\n' if self.downloadFile(videoUrl, dlPath) is True: print 'Download complete' else: print 'No data found.' 
def scrapTitle(self, url): # https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml' data = self.spider.fetchData(xmlUrl) if data and len(data) > 0: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) print data return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data) def downloadFile(self, url, downloadPath, retry=0): try: opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(), urllib2.HTTPHandler(debuglevel=0), urllib2.HTTPSHandler(debuglevel=0)) opener.addheaders = [ ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Connection', 'keep-alive')] # resp = opener.open(url, timeout=10) resp = urllib2.urlopen(url, timeout=60) print 'ok' print resp.info() contentLength = resp.info()['Content-Length'] contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength) totalSize = float(contentLength) directory = os.path.dirname(downloadPath) if not os.path.exists(directory): os.makedirs(directory) currentSize = 0 dl_file = open(downloadPath, 'ab') try: if os.path.getsize(downloadPath): start = os.path.getsize(downloadPath) currentSize = start opener.addheaders.append(('Range', 'bytes=%s-' % (start))) except Exception, x: print x res = opener.open(url, timeout=60) CHUNK_SIZE = 256 * 1024 while True: data = res.read(CHUNK_SIZE) # data = resp.read(CHUNK_SIZE) if not data: break currentSize += len(data) dl_file.write(data) print('============> ' + \ str(round(float(currentSize * 100) / totalSize, 2)) + \ '% of ' + str(totalSize / (1024 * 1024)) + ' Mega Bytes') notifyDl = '===> Downloaded ' + str(round(float(currentSize * 100) / totalSize, 2)) + '% of ' + str( totalSize) + ' KB.' 
if currentSize >= totalSize: dl_file.close() return True except Exception, x: error = 'Error downloading: ' print x if retry < 20: time.sleep(30) return self.downloadFile(url, downloadPath, retry + 1)
class MyLinkedIn(QThread):
    """Worker thread that logs in to LinkedIn and harvests the account's
    group list.

    NOTE(review): a second, functionally identical MyLinkedIn class is
    defined later in this file; at import time the later definition wins.
    """

    # HTML status strings for the GUI log.
    notifyLinkedIn = pyqtSignal(object)
    # List of (groupName, membersUrl) tuples for the account's groups.
    notifyMember = pyqtSignal(object)
    # The logged-in Spider instance, so other workers can reuse the session.
    cookieL = pyqtSignal(object)

    def __init__(self, username, password):
        QThread.__init__(self)
        self.spider = Spider()
        self.regex = Regex()
        self.username = username
        self.password = password

    def run(self):
        # Only scrape groups after a successful login.
        if self.login():
            self.getAllGroups()

    def login(self):
        """Log in with the stored credentials.

        Fetches the login form, extracts its hidden fields (session_redirect,
        csrfToken, sourceAlias), posts them back with username/password, and
        returns True when the response contains the sign-out link.
        """
        print "login start"
        self.notifyLinkedIn.emit("<b>Trying to login. Please wait...</b>")
        loginPageData = self.spider.fetchData("https://www.linkedin.com/uas/login?goback=&trk=hb_signin")
        loginPageData = self.regex.reduceNewLine(loginPageData)
        loginPageData = self.regex.reduceBlankSpace(loginPageData)
        ## <input type="hidden" name="session_redirect" value="" id="session_redirect-login"><input type="hidden" name="csrfToken" value="ajax:9073845200579364133" id="csrfToken-login"><input type="hidden" name="sourceAlias" value="0_7r5yezRXCiA_H0CRD8sf6DhOjTKUNps5xGTqeX8EEoi" id="sourceAlias-login">
        self.sessionRedirect = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="session_redirect" value="([^"]*)"', loginPageData
        )
        self.token = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="csrfToken" value="([^"]*)"', loginPageData
        )
        self.alias = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="sourceAlias" value="([^"]*)"', loginPageData
        )
        loginParam = {
            "csrfToken": self.token,
            "isJsEnabled": "true",
            "session_key": self.username,
            "session_password": self.password,
            "session_redirect": self.sessionRedirect,
            "signin": "Sign In",
            "sourceAlias": self.alias,
            "source_app": "",
        }
        print loginParam
        print "start login"
        # Short delay before posting the form.
        time.sleep(5)
        loginData = self.spider.login("https://www.linkedin.com/uas/login-submit", loginParam)
        loginData = self.regex.reduceNewLine(loginData)
        loginData = self.regex.reduceBlankSpace(loginData)
        # The sign-out list item only appears on authenticated pages.
        isLoggedIn = self.regex.isFoundPattern('(?i)<li class="signout">', loginData)
        if isLoggedIn:
            self.notifyLinkedIn.emit("<font color=green><b>Successfully Logged In.</b></font>")
            print "login success"
            # Hand the authenticated session to listeners.
            self.cookieL.emit(self.spider)
            return True
        else:
            self.notifyLinkedIn.emit(
                "<font color=red><b>Something wrong with logging in. Please try again or check manually with this username/password</b></font>"
            )
            return False

    def getAllGroups(self):
        """Scrape the 'myGroups' page and emit (groupName, membersUrl) tuples
        via notifyMember."""
        print "start groups"
        self.notifyLinkedIn.emit("<font color=green><b>Start Scraping All Groups.</b></font>")
        self.notifyLinkedIn.emit("<b>Wait for 15 second break...</b>")
        time.sleep(15)
        self.notifyLinkedIn.emit("<b>15 second break finish!!!</b>")
        self.notifyLinkedIn.emit("<font color=green><b>Fetching data for scraping your groups.</b></font>")
        groupsUrl = "http://www.linkedin.com/myGroups?trk=hb_side_grps_top"
        groupsData = self.spider.fetchData(groupsUrl)
        self.notifyLinkedIn.emit("<font color=green><b>Data fetching complete for scraping your groups.</b></font>")
        if groupsData is not None and len(groupsData) > 0:
            print "starting groups"
            groupsData = self.regex.reduceNewLine(groupsData)
            groupsData = self.regex.reduceBlankSpace(groupsData)
            print groupsData
            ## <a href="/groups?gid=72881&trk=myg_ugrp_ovr" class="private" title="This group is members only">MySQL Professionals</a>
            groupInfo = self.regex.getAllSearchedData('(?i)<a href="(/groups\?gid=[^"]*)"[^>]*>([^<]*)</a>', groupsData)
            if groupInfo is not None and len(groupInfo) > 0:
                members = []
                for group in groupInfo:
                    groupUrl = "http://www.linkedin.com" + str(group[0])
                    groupName = str(group[1])
                    self.notifyLinkedIn.emit("<b>Group Name: </b>%s <b>URL: </b>%s" % (groupName, groupUrl))
                    # Rewrite to the members-listing URL for this gid, e.g.
                    # http://www.linkedin.com/groups?members=&gid=65688&trk=anet_ug_memb
                    gid = self.regex.getSearchedData("(?i)gid=(\d+)", group[0])
                    print gid
                    groupUrl = "http://www.linkedin.com/groups?members=&gid=" + gid + "&trk=anet_ug_memb"
                    members.append((groupName, groupUrl))
                self.notifyMember.emit(members)
        self.notifyLinkedIn.emit("<font color=red><b>Finish Scraping All Groups.</b></font>")
class MainForm(QMainWindow): def __init__(self, parent=None): super(MainForm, self).__init__(parent) self.regex = Regex() self.alreadyClickedA = False self.alreadyClickedB = False self.fileDir = None self.fileDirB = None self.fileName = None self.fileNameB = None self.totalUrlA = 0 self.totalUrlB = 0 self.currentUrlA = 0 self.currentUrlB = 0 self.pdfCounter = 1 self.pdfCounterB = 1 self.typeName = 'B' self.setupUI() def setupUI(self): self.isActionEvent = False ## Web URL self.labelUrl = QLabel( '<font size=4><b>Select text File with url List: </b></font>') self.labelUrl.setAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter) self.labelUrl.setFixedWidth(200) self.btnUrlList = QPushButton('&Browse') self.btnUrlList.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnUrlList.setFixedWidth(100) self.btnUrlList.clicked.connect(self.urlListSelected) self.labelSelectedUrl = QLabel() self.labelSelectedUrl.setAlignment(QtCore.Qt.AlignLeft | QtCore.Qt.AlignVCenter) layoutUrl = QHBoxLayout() layoutUrl.addWidget(self.btnUrlList) layoutUrl.addWidget(self.labelSelectedUrl) ## File Path self.labelPdfPath = QLabel( '<font size=4><b>Select Pdf Path: </b></font>') self.labelPdfPath.setAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter) self.labelPdfPath.setFixedWidth(200) self.btnOpenPdfDir = QPushButton('&Browse') self.btnOpenPdfDir.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnOpenPdfDir.setFixedWidth(100) self.btnOpenPdfDir.clicked.connect(self.pdfPathSelected) self.labelSelectedPath = QLabel() self.labelSelectedPath.setAlignment(QtCore.Qt.AlignLeft | QtCore.Qt.AlignVCenter) layoutPath = QHBoxLayout() layoutPath.addWidget(self.btnOpenPdfDir) layoutPath.addWidget(self.labelSelectedPath) self.labelGrouping = QLabel( '<font size=4><b>"Raw Numbering" and "Group Similar URLs" (A and B): </b></font>' ) self.labelGrouping.setAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter) self.comboGrouping = QComboBox() self.comboGrouping.setFont(QFont('Helvetica', 8, 
QFont.Bold)) self.comboGrouping.setFixedWidth(100) self.comboGrouping.addItem('B') self.comboGrouping.addItem('A') self.comboGrouping.activated[str].connect(self.onActivated) layoutComboGrouping = QHBoxLayout() layoutComboGrouping.addWidget(self.comboGrouping) # layoutComboGrouping.addWidget(self.btnGroupingHelp) self.btnPrintPdf = QPushButton('&Start') self.btnPrintPdf.setFixedWidth(100) self.btnPrintPdf.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnPrintPdf.clicked.connect(self.printPdfAction) self.btnClear = QPushButton('&Clear Results') self.btnClear.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnClear.setFixedWidth(100) self.btnClear.clicked.connect(self.clearAll) self.btnGroupingHelp = QPushButton('&Help') self.btnGroupingHelp.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnGroupingHelp.setFixedWidth(100) self.btnGroupingHelp.clicked.connect(self.groupingHelpAction) layoutAction = QHBoxLayout() layoutAction.addWidget(self.btnPrintPdf) layoutAction.addWidget(self.btnClear) layoutAction.addWidget(self.btnGroupingHelp) layoutTop = QGridLayout() layoutTop.addWidget(self.labelUrl, 0, 0) layoutTop.addLayout(layoutUrl, 0, 1, Qt.AlignLeft) layoutTop.addWidget(self.labelPdfPath, 1, 0) layoutTop.addLayout(layoutPath, 1, 1, Qt.AlignLeft) # layoutTop.addWidget(self.labelGrouping, 2, 0) # layoutTop.addLayout(layoutComboGrouping, 2, 1, Qt.AlignLeft) # layoutTop.addWidget(self.btnClear, 3, 0, Qt.AlignRight) layoutTop.addLayout(layoutAction, 2, 1, Qt.AlignLeft) ## Bottom Portion self.labelProStatusA = QLabel() self.labelProStatusB = QLabel() self.labelWebAddress = QLabel('<b>Current URL Being Processed:</b>') self.lineEditWebAddress = QLineEdit() self.lineEditWebAddress.setReadOnly(True) self.labelStatus = QLabel('<b>Pdf Generation Status:</b>') self.textBrowserStatus = QTextBrowser() self.textBrowserStatus.setReadOnly(True) layout = QVBoxLayout() # layout.addLayout(layoutUrl) # layout.addLayout(layoutPath) layout.addLayout(layoutTop) 
layout.addWidget(self.labelProStatusA) layout.addWidget(self.labelProStatusB) layout.addWidget(self.labelWebAddress) layout.addWidget(self.lineEditWebAddress) layout.addWidget(self.labelStatus) layout.addWidget(self.textBrowserStatus) widget = QWidget() widget.setLayout(layout) self.setCentralWidget(widget) self.statusBar().showMessage(QString("Application Started...."), 500) self.setWindowTitle('PDF Batch Saver') self.setWindowFlags(Qt.WindowCloseButtonHint | Qt.WindowMinimizeButtonHint) screen = QDesktopWidget().screenGeometry() # self.setFixedSize((screen.width() / 2) + 150, (screen.height() / 2) + 150) self.resize((screen.width() / 2) + 150, (screen.height() / 2) + 150) def printPdfAction(self): if self.fileName is not None and self.fileDir is not None and self.alreadyClickedA is False and self.typeName == 'A': self.webToPdf = WebPageToPdf() self.webToPdf.threadPdfStatusBar.connect(self.showStatus) self.webToPdf.threadPdfWritingStatus.connect(self.appendStatus) self.webToPdf.threadPdfWritingDone.connect(self.pdfGenFinished) f = open(self.fileName, 'rb') self.lists = f.readlines() f.close() self.totalUrlA = len(self.lists) self.alreadyClickedA = True self.pdfGenFinished() elif self.fileNameB is not None and self.fileDirB is not None and self.alreadyClickedB is False and self.typeName == 'B': self.webToPdfB = WebPageToPdf() self.webToPdfB.threadPdfStatusBar.connect(self.showStatus) self.webToPdfB.threadPdfWritingStatus.connect(self.appendStatus) self.webToPdfB.threadPdfWritingDone.connect(self.pdfGenFinishedB) f = open(self.fileNameB, 'rb') self.listsB = f.readlines() f.close() pdfFiles = [ f for f in os.listdir(self.fileDirB) if f.endswith('.pdf') ] if len(pdfFiles) > 0: self.pdfCounterB = int( self.regex.getSearchedData('(?i)^(\d+)_', pdfFiles[-1])) + 1 self.totalUrlB = len(self.listsB) self.alreadyClickedB = True self.startTime = time.clock() self.pdfGenFinishedB() else: QMessageBox.warning( None, 'Warning', 'Please Select your URL List and PDF writing Path.') 
def pdfGenFinished(self): if self.lists is not None and len(self.lists) > 0: self.currentUrlA += 1 url = self.lists.pop(0) self.lineEditWebAddress.setText(url) url = url.strip() self.labelProStatusA.setText( '<font color="green" size=4><b>For grouping "A": <u> %s </u> total items in the batch, processing <u> %s </u> out of <u> %s </u></b></font>' % (str(self.totalUrlA), str( self.currentUrlA), str(self.totalUrlA))) pdfFile = str(url).split('/')[-1] print 'pdf file : ' + pdfFile pdfFile = self.regex.getSearchedData( '(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', pdfFile) pdfFiles = [ f for f in os.listdir(self.fileDir) if f.endswith('.pdf') ] finalPdfFile = '' i = 2 for file in pdfFiles: if self.regex.isFoundPattern('(?i)' + pdfFile, file): index = self.regex.getSearchedData('(?i)(\d+).*?$', file) finalPdfFile = str(index) + '_' + str( pdfFile) + '_copy_' + str(i) + '.pdf' i += 1 if len(finalPdfFile) is 0: finalPdfFile = str(self.pdfCounter) + '_' + pdfFile + '.pdf' else: self.pdfCounter -= 1 self.webToPdf.printWebHtmlToPdf(url, self.fileDir + '/', finalPdfFile, 'A') self.pdfCounter += 1 else: self.showStatus('Pdf Generation Completed') self.alreadyClicked = False self.totalUrlA = 0 self.currentUrlA = 0 # self.labelProStatusA.setText('') def pdfGenFinishedB(self): if self.listsB is not None and len(self.listsB) > 0: self.currentUrlB += 1 url = self.listsB.pop(0) self.lineEditWebAddress.setText(url) url = url.strip() # self.labelProStatusB.setText( # '<font color="green" size=4><b>For grouping "B": <u> %s </u> total items in the batch, processing <u> %s </u> out of <u> %s </u></b></font>' % ( # str( # self.totalUrlB), str(self.currentUrlB), str(self.totalUrlB))) elapsedTime = time.clock() - self.startTime print elapsedTime self.labelProStatusB.setText( '<font size=4><b>URL <u> %s </u> of <u> %s </u> being processed. 
Time elapsed: %s</b></font>' % (str(self.currentUrlB), str(self.totalUrlB), str(time.strftime('%H:%M:%S', time.gmtime(elapsedTime))))) pdfFile = str(url).split('/')[-1] print 'pdf file : ' + pdfFile # pdfFile = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_. ]*)$', url) pdfFile = self.regex.getSearchedData( '(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', pdfFile) pdfFiles = [ f for f in os.listdir(self.fileDirB) if f.endswith('.pdf') ] # self.pdfCounterB = int(self.regex.getSearchedData('(?i)^(\d+)_', pdfFiles[-1])) finalPdfFile = '' i = 2 for file in pdfFiles: if self.regex.isFoundPattern('(?i)' + pdfFile, file): finalPdfFile = str(self.pdfCounterB) + '_' + str( pdfFile) + '_copy_' + str(i) + '.pdf' i += 1 if len(finalPdfFile) is 0: finalPdfFile = str(self.pdfCounterB) + '_' + pdfFile + '.pdf' self.webToPdfB.printWebHtmlToPdf(url, self.fileDirB + '/', finalPdfFile, 'B') self.labelProStatusB.setText( '<font size=4><b>URL <u> %s </u> of <u> %s </u> being processed. Time elapsed: %s</b></font>' % (str(self.currentUrlB), str(self.totalUrlB), str(time.strftime('%H:%M:%S', time.gmtime(elapsedTime))))) self.pdfCounterB += 1 else: self.showStatus('Pdf Generation Completed') self.alreadyClickedB = False self.totalUrlB = 0 self.currentUrlB = 0 self.fileDirB = None self.fileNameB = None # self.labelProStatusB.setText('') def urlListSelected(self): if self.typeName == 'A': self.fileName = QtGui.QFileDialog.getOpenFileName( self, "Select Text File", QDir.homePath() + "/Desktop") if self.typeName == 'B': self.fileNameB = QtGui.QFileDialog.getOpenFileName( self, "Select Text File", QDir.homePath() + "/Desktop") self.labelSelectedUrl.setText('<b>%s</b>' % str(self.fileNameB)) def pdfPathSelected(self): if self.typeName == 'A': self.fileDir = QtGui.QFileDialog.getExistingDirectory( self, "Select Directory", QDir.homePath() + "/Desktop") self.pdfCounter = 1 if self.typeName == 'B': self.fileDirB = QtGui.QFileDialog.getExistingDirectory( self, "Select Directory", QDir.homePath() + 
"/Desktop") self.pdfCounterB = 1 self.labelSelectedPath.setText('<b>%s</b>' % str(self.fileDirB)) def onActivated(self, text): self.typeName = text self.pdfCounter = 1 def clearAll(self): self.lineEditWebAddress.clear() self.textBrowserStatus.clear() self.statusBar().showMessage('') self.pdfCounterB = 1 self.labelProStatusB.setText('') self.fileDirB = None self.fileNameB = None def groupingHelpAction(self): QMessageBox.information( None, 'Help Message', 'This program reads a text file of URLs and produces a series of PDFs. If the source text file contains more than one listing of the same URL, the program will create an extra copy of the PDF anyway in the output folder.' ) def appendStatus(self, data): self.textBrowserStatus.append(data) def showStatus(self, data): self.statusBar().showMessage(data)
class MyLinkedIn(QThread): notifyLinkedIn = pyqtSignal(object) notifyMember = pyqtSignal(object) cookieL = pyqtSignal(object) def __init__(self, username, password): QThread.__init__(self) self.spider = Spider() self.regex = Regex() self.username = username self.password = password def run(self): if self.login(): self.getAllGroups() def login(self): print 'login start' self.notifyLinkedIn.emit('<b>Trying to login. Please wait...</b>') loginPageData = self.spider.fetchData( 'https://www.linkedin.com/uas/login?goback=&trk=hb_signin') loginPageData = self.regex.reduceNewLine(loginPageData) loginPageData = self.regex.reduceBlankSpace(loginPageData) ## <input type="hidden" name="session_redirect" value="" id="session_redirect-login"><input type="hidden" name="csrfToken" value="ajax:9073845200579364133" id="csrfToken-login"><input type="hidden" name="sourceAlias" value="0_7r5yezRXCiA_H0CRD8sf6DhOjTKUNps5xGTqeX8EEoi" id="sourceAlias-login"> self.sessionRedirect = self.regex.getSearchedData( '(?i)<input type="hidden" name="session_redirect" value="([^"]*)"', loginPageData) self.token = self.regex.getSearchedData( '(?i)<input type="hidden" name="csrfToken" value="([^"]*)"', loginPageData) self.alias = self.regex.getSearchedData( '(?i)<input type="hidden" name="sourceAlias" value="([^"]*)"', loginPageData) loginParam = { 'csrfToken': self.token, 'isJsEnabled': 'true', 'session_key': self.username, 'session_password': self.password, # 'session_key': '*****@*****.**', # 'session_password': '******', 'session_redirect': self.sessionRedirect, 'signin': 'Sign In', 'sourceAlias': self.alias, 'source_app': '' } print loginParam print 'start login' time.sleep(5) loginData = self.spider.login( 'https://www.linkedin.com/uas/login-submit', loginParam) loginData = self.regex.reduceNewLine(loginData) loginData = self.regex.reduceBlankSpace(loginData) # print loginData isLoggedIn = self.regex.isFoundPattern('(?i)<li class="signout">', loginData) if isLoggedIn: self.notifyLinkedIn.emit( 
'<font color=green><b>Successfully Logged In.</b></font>') print 'login success' self.cookieL.emit(self.spider) return True else: self.notifyLinkedIn.emit( '<font color=red><b>Something wrong with logging in. Please try again or check manually with this username/password</b></font>' ) return False def getAllGroups(self): print 'start groups' self.notifyLinkedIn.emit( '<font color=green><b>Start Scraping All Groups.</b></font>') self.notifyLinkedIn.emit('<b>Wait for 15 second break...</b>') time.sleep(15) self.notifyLinkedIn.emit('<b>15 second break finish!!!</b>') self.notifyLinkedIn.emit( '<font color=green><b>Fetching data for scraping your groups.</b></font>' ) groupsUrl = 'http://www.linkedin.com/myGroups?trk=hb_side_grps_top' groupsData = self.spider.fetchData(groupsUrl) self.notifyLinkedIn.emit( '<font color=green><b>Data fetching complete for scraping your groups.</b></font>' ) if groupsData is not None and len(groupsData) > 0: print 'starting groups' groupsData = self.regex.reduceNewLine(groupsData) groupsData = self.regex.reduceBlankSpace(groupsData) print groupsData ## <a href="/groups?gid=72881&trk=myg_ugrp_ovr" class="private" title="This group is members only">MySQL Professionals</a> groupInfo = self.regex.getAllSearchedData( '(?i)<a href="(/groups\?gid=[^"]*)"[^>]*>([^<]*)</a>', groupsData) if groupInfo is not None and len(groupInfo) > 0: members = [] for group in groupInfo: groupUrl = 'http://www.linkedin.com' + str(group[0]) groupName = str(group[1]) self.notifyLinkedIn.emit( '<b>Group Name: </b>%s <b>URL: </b>%s' % (groupName, groupUrl)) # http://www.linkedin.com/groups?members=&gid=65688&trk=anet_ug_memb gid = self.regex.getSearchedData('(?i)gid=(\d+)', group[0]) print gid groupUrl = 'http://www.linkedin.com/groups?members=&gid=' + gid + '&trk=anet_ug_memb' members.append((groupName, groupUrl)) self.notifyMember.emit(members) self.notifyLinkedIn.emit( '<font color=red><b>Finish Scraping All Groups.</b></font>')
class YtDownloadManager(object): def __init__(self): self.spider = Spider() self.regex = Regex() self.utils = Utils() def scrapVideoDownloadUrl(self, url): data = self.spider.fetchData(url) print data soup = BeautifulSoup(data) exit(1) if data and len(data) > 0: title = self.scrapTitle(url) data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data) dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk) dlUrlParts = dlUrlChunk.split(',') sig = '' video = '' videoUrl = '' print dlUrlParts for dlUrlPart in dlUrlParts: dlUrlPart = urllib2.unquote(dlUrlPart) print dlUrlPart # if self.regex.isFoundPattern('(?i)itag=5', dlUrlPart): urlPart = dlUrlPart.split(' ') for part in urlPart: print part if self.regex.isFoundPattern('(?i)sig=.*?', part): sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part) if self.regex.isFoundPattern('(?i)url=.*?', part): video = self.regex.getSearchedData('(?i)url=(.*?)$', part) print video videoUrl = video + '&signature=' + sig self.downloadDir = './test.flv' # print 'Video URL= ' + videoUrl # print self.downloadDir # dlPath = './test.flv' # print dlPath print '\n\n' # if self.downloadFile(videoUrl, dlPath) is True: # break def scrapTitle(self, url): # https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml' data = self.spider.fetchData(xmlUrl) if data and len(data) > 0: data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data) def downloadFile(self, url, downloadPath, retry=0): try: opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(), urllib2.HTTPHandler(debuglevel=0), urllib2.HTTPSHandler(debuglevel=0)) opener.addheaders = [ ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 
Firefox/14.0.1'), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Connection', 'keep-alive')] # resp = opener.open(url, timeout=10) resp = urllib2.urlopen(url, timeout=30) print resp.info() contentLength = resp.info()['Content-Length'] contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength) totalSize = float(contentLength) directory = os.path.dirname(downloadPath) if not os.path.exists(directory): os.makedirs(directory) dl_file = open(downloadPath, 'wb') currentSize = 0 CHUNK_SIZE = 32768 while True: data = resp.read(CHUNK_SIZE) if not data: break currentSize += len(data) dl_file.write(data) print('============> ' + \ str(round(float(currentSize * 100) / totalSize, 2)) + \ '% of ' + str(totalSize) + ' bytes') notifyDl = '===> Downloaded ' + str(round(float(currentSize * 100) / totalSize, 2)) + '% of ' + str( totalSize) + ' KB.' if currentSize >= totalSize: dl_file.close() return True except Exception, x: error = 'Error downloading: ' + x return False
class MyLinkedInMessage(QThread): notifyLinkedIn = pyqtSignal(object) def __init__(self, spider, memberList, subject, message): QThread.__init__(self) # self.spider = Spider() self.spider = spider self.regex = Regex() self.memberList = memberList self.subject = unicode(subject) self.message = unicode(message) def run(self): self.sendMessage() self.notifyLinkedIn.emit('<font color=red><b>Finish Sending All Messages.</b></font>') def sendMessage(self): print self.memberList for member in self.memberList: messageUrl = 'http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=' + str(member[1]) print messageUrl # messageUrl = 'http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=' + '65471931' # data = self.spider.fetchData('http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=65471931') data = self.spider.fetchData(messageUrl) data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) fromName = self.regex.getSearchedData('(?i)<input type="hidden" name="fromName" value="([^"]*)"', data) fromEmail = self.regex.getSearchedData('(?i)<input type="hidden" name="fromEmail" value="([^"]*)"', data) # connectionIds = self.regex.getSearchedData('(?i)<input type="hidden" name="connectionIds" value="([^"]*)"', data) csrfToken = self.regex.getSearchedData('(?i)<input type="hidden" name="csrfToken" value="([^"]*)"', data) sourceAlias = self.regex.getSearchedData('(?i)<input type="hidden" name="sourceAlias" value="([^"]*)"', data) linkedInSubject = u'Hi ' + unicode(member[0]).split(' ')[0] + self.subject linkedInMessage = u'Hi ' + unicode(member[0]).split(' ')[0] + u',\n' + self.message print linkedInMessage params = {'addMoreRcpts': 'false', 'ajaxSubmit': 'Send Message', 'allowEditRcpts': 'true', 'body': linkedInMessage, 'connectionIds': str(member[1]), 'connectionNames': '', 'csrfToken': csrfToken, 'fromEmail': fromEmail, 'fromName': fromName, 'itemID': '', 'openSocialAppBodySuffix': '', 'showRecipeints': 'showRecipeints', 
'sourceAlias': sourceAlias, 'st': '', 'subject': linkedInSubject, 'submit': 'Send Message', 'viewerDestinationUrl': ''} print params msgUrl = 'http://www.linkedin.com/msgToConns?displayCreate=' data = self.spider.fetchData(msgUrl, params) data = self.regex.reduceNewLine(data) data = self.regex.reduceBlankSpace(data) if self.regex.isFoundPattern('(?i)<div class="alert success">', data): print 'Message Sent.' self.notifyLinkedIn.emit('<font color=green><b>Successfully Sent Message To: %s</b></font>' % member[0]) else: self.notifyLinkedIn.emit('<font color=red><b>Something Wrong during Send Message To</b></font>' % member[0]) # params = {'addMoreRcpts': 'false', # 'ajaxSubmit': 'Send Message', # 'allowEditRcpts': 'true', # 'body': 'Script Test', # 'connectionIds': '65471931', # 'connectionNames': '', # 'csrfToken': 'ajax: 6539671039643459056', # 'fromEmail': '467728216', # 'fromName': 'Mehedi Hasan', # 'itemID': '', # 'openSocialAppBodySuffix': '', # 'showRecipeints': 'showRecipeints', # 'sourceAlias': '0_6k2algZhQ6vbvlhlVSByxRKi0OB9NXjxrnJYWBFvfhn', # 'st': '', # 'subject': 'Script Test', # 'submit': 'Send Message', # 'viewerDestinationUrl': ''} #<input type="hidden" name="fromName" value="Mehedi Hasan" id="fromName-msgForm"> # <input type="hidden" name="showRecipeints" value="showRecipeints" id="showRecipeints-msgForm"> # <input type="hidden" name="fromEmail" value="467728216" id="fromEmail-msgForm"> # <input type="hidden" name="connectionIds" value="65471931" id="connectionIds-msgForm"> # <input type="hidden" name="connectionNames" value="" id="connectionNames-msgForm"> # <input type="hidden" name="allowEditRcpts" value="true" id="allowEditRcpts-msgForm"> # <input type="hidden" name="addMoreRcpts" value="false" id="addMoreRcpts-msgForm"> # <input type="hidden" name="itemID" value="" id="itemID-msgForm"> # <input type="hidden" name="openSocialAppBodySuffix" value="" id="openSocialAppBodySuffix-msgForm"> # <input type="hidden" name="st" value="" id="st-msgForm"> # 
<input type="hidden" name="viewerDestinationUrl" value="" id="viewerDestinationUrl-msgForm"> # <input type="hidden" name="csrfToken" value="ajax:6539671039643459056" id="csrfToken-msgForm"> # <input type="hidden" name="sourceAlias" value="0_6k2algZhQ6vbvlhlVSByxRKi0OB9NXjxrnJYWBFvfhn" id="sourceAlias-msgForm"> """ msgUrl1 = 'http://www.linkedin.com/msgToConns?displayCreate=' msgParams = {} addMoreRcpts false ajaxSubmit Send Message allowEditRcpts true body fdgdfgdfgdfg dg d connectionIds 57414219 connectionNames csrfToken ajax:3480949306085123249 fromEmail 467728216 fromName Mehedi Hasan goback .con.npv_57414219_*1_*1_name_r5tN_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1 itemID openSocialAppBodySuffix showRecipeints showRecipeints sourceAlias 0_6k2algZhQ6vbvlhlVSByxRKi0OB9NXjxrnJYWBFvfhn st subject viewerDestinationUrl """ """addMoreRcpts false
class MainForm(QMainWindow): def __init__(self, parent=None): super(MainForm, self).__init__(parent) self.regex = Regex() self.alreadyClickedA = False self.alreadyClickedB = False self.fileDir = None self.fileDirB = None self.fileName = None self.fileNameB = None self.totalUrlA = 0 self.totalUrlB = 0 self.currentUrlA = 0 self.currentUrlB = 0 self.pdfCounter = 1 self.pdfCounterB = 1 self.typeName = 'B' self.setupUI() def setupUI(self): self.isActionEvent = False ## Web URL self.labelUrl = QLabel('<font size=4><b>Select text File with url List: </b></font>') self.labelUrl.setAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter) self.labelUrl.setFixedWidth(200) self.btnUrlList = QPushButton('&Browse') self.btnUrlList.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnUrlList.setFixedWidth(100) self.btnUrlList.clicked.connect(self.urlListSelected) self.labelSelectedUrl = QLabel() self.labelSelectedUrl.setAlignment(QtCore.Qt.AlignLeft | QtCore.Qt.AlignVCenter) layoutUrl = QHBoxLayout() layoutUrl.addWidget(self.btnUrlList) layoutUrl.addWidget(self.labelSelectedUrl) ## File Path self.labelPdfPath = QLabel('<font size=4><b>Select Pdf Path: </b></font>') self.labelPdfPath.setAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter) self.labelPdfPath.setFixedWidth(200) self.btnOpenPdfDir = QPushButton('&Browse') self.btnOpenPdfDir.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnOpenPdfDir.setFixedWidth(100) self.btnOpenPdfDir.clicked.connect(self.pdfPathSelected) self.labelSelectedPath = QLabel() self.labelSelectedPath.setAlignment(QtCore.Qt.AlignLeft | QtCore.Qt.AlignVCenter) layoutPath = QHBoxLayout() layoutPath.addWidget(self.btnOpenPdfDir) layoutPath.addWidget(self.labelSelectedPath) self.labelGrouping = QLabel('<font size=4><b>"Raw Numbering" and "Group Similar URLs" (A and B): </b></font>') self.labelGrouping.setAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter) self.comboGrouping = QComboBox() self.comboGrouping.setFont(QFont('Helvetica', 8, QFont.Bold)) 
self.comboGrouping.setFixedWidth(100) self.comboGrouping.addItem('B') self.comboGrouping.addItem('A') self.comboGrouping.activated[str].connect(self.onActivated) layoutComboGrouping = QHBoxLayout() layoutComboGrouping.addWidget(self.comboGrouping) # layoutComboGrouping.addWidget(self.btnGroupingHelp) self.btnPrintPdf = QPushButton('&Start') self.btnPrintPdf.setFixedWidth(100) self.btnPrintPdf.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnPrintPdf.clicked.connect(self.printPdfAction) self.btnClear = QPushButton('&Clear Results') self.btnClear.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnClear.setFixedWidth(100) self.btnClear.clicked.connect(self.clearAll) self.btnGroupingHelp = QPushButton('&Help') self.btnGroupingHelp.setFont(QFont('Helvetica', 8, QFont.Bold)) self.btnGroupingHelp.setFixedWidth(100) self.btnGroupingHelp.clicked.connect(self.groupingHelpAction) layoutAction = QHBoxLayout() layoutAction.addWidget(self.btnPrintPdf) layoutAction.addWidget(self.btnClear) layoutAction.addWidget(self.btnGroupingHelp) layoutTop = QGridLayout() layoutTop.addWidget(self.labelUrl, 0, 0) layoutTop.addLayout(layoutUrl, 0, 1, Qt.AlignLeft) layoutTop.addWidget(self.labelPdfPath, 1, 0) layoutTop.addLayout(layoutPath, 1, 1, Qt.AlignLeft) # layoutTop.addWidget(self.labelGrouping, 2, 0) # layoutTop.addLayout(layoutComboGrouping, 2, 1, Qt.AlignLeft) # layoutTop.addWidget(self.btnClear, 3, 0, Qt.AlignRight) layoutTop.addLayout(layoutAction, 2, 1, Qt.AlignLeft) ## Bottom Portion self.labelProStatusA = QLabel() self.labelProStatusB = QLabel() self.labelWebAddress = QLabel('<b>Current URL Being Processed:</b>') self.lineEditWebAddress = QLineEdit() self.lineEditWebAddress.setReadOnly(True) self.labelStatus = QLabel('<b>Pdf Generation Status:</b>') self.textBrowserStatus = QTextBrowser() self.textBrowserStatus.setReadOnly(True) layout = QVBoxLayout() # layout.addLayout(layoutUrl) # layout.addLayout(layoutPath) layout.addLayout(layoutTop) layout.addWidget(self.labelProStatusA) 
layout.addWidget(self.labelProStatusB) layout.addWidget(self.labelWebAddress) layout.addWidget(self.lineEditWebAddress) layout.addWidget(self.labelStatus) layout.addWidget(self.textBrowserStatus) widget = QWidget() widget.setLayout(layout) self.setCentralWidget(widget) self.statusBar().showMessage(QString("Application Started...."), 500) self.setWindowTitle('PDF Batch Saver') self.setWindowFlags(Qt.WindowCloseButtonHint | Qt.WindowMinimizeButtonHint) screen = QDesktopWidget().screenGeometry() # self.setFixedSize((screen.width() / 2) + 150, (screen.height() / 2) + 150) self.resize((screen.width() / 2) + 150, (screen.height() / 2) + 150) def printPdfAction(self): if self.fileName is not None and self.fileDir is not None and self.alreadyClickedA is False and self.typeName == 'A': self.webToPdf = WebPageToPdf() self.webToPdf.threadPdfStatusBar.connect(self.showStatus) self.webToPdf.threadPdfWritingStatus.connect(self.appendStatus) self.webToPdf.threadPdfWritingDone.connect(self.pdfGenFinished) f = open(self.fileName, 'rb') self.lists = f.readlines() f.close() self.totalUrlA = len(self.lists) self.alreadyClickedA = True self.pdfGenFinished() elif self.fileNameB is not None and self.fileDirB is not None and self.alreadyClickedB is False and self.typeName == 'B': self.webToPdfB = WebPageToPdf() self.webToPdfB.threadPdfStatusBar.connect(self.showStatus) self.webToPdfB.threadPdfWritingStatus.connect(self.appendStatus) self.webToPdfB.threadPdfWritingDone.connect(self.pdfGenFinishedB) f = open(self.fileNameB, 'rb') self.listsB = f.readlines() f.close() pdfFiles = [f for f in os.listdir(self.fileDirB) if f.endswith('.pdf')] if len(pdfFiles) > 0: self.pdfCounterB = int(self.regex.getSearchedData('(?i)^(\d+)_', pdfFiles[-1])) + 1 self.totalUrlB = len(self.listsB) self.alreadyClickedB = True self.startTime = time.clock() self.pdfGenFinishedB() else: QMessageBox.warning(None, 'Warning', 'Please Select your URL List and PDF writing Path.') def pdfGenFinished(self): if self.lists is 
not None and len(self.lists) > 0: self.currentUrlA += 1 url = self.lists.pop(0) self.lineEditWebAddress.setText(url) url = url.strip() self.labelProStatusA.setText( '<font color="green" size=4><b>For grouping "A": <u> %s </u> total items in the batch, processing <u> %s </u> out of <u> %s </u></b></font>' % ( str( self.totalUrlA), str(self.currentUrlA), str(self.totalUrlA))) pdfFile = str(url).split('/')[-1] print 'pdf file : ' + pdfFile pdfFile = self.regex.getSearchedData('(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', pdfFile) pdfFiles = [f for f in os.listdir(self.fileDir) if f.endswith('.pdf')] finalPdfFile = '' i = 2 for file in pdfFiles: if self.regex.isFoundPattern('(?i)' + pdfFile, file): index = self.regex.getSearchedData('(?i)(\d+).*?$', file) finalPdfFile = str(index) + '_' + str(pdfFile) + '_copy_' + str(i) + '.pdf' i += 1 if len(finalPdfFile) is 0: finalPdfFile = str(self.pdfCounter) + '_' + pdfFile + '.pdf' else: self.pdfCounter -= 1 self.webToPdf.printWebHtmlToPdf(url, self.fileDir + '/', finalPdfFile, 'A') self.pdfCounter += 1 else: self.showStatus('Pdf Generation Completed') self.alreadyClicked = False self.totalUrlA = 0 self.currentUrlA = 0 # self.labelProStatusA.setText('') def pdfGenFinishedB(self): if self.listsB is not None and len(self.listsB) > 0: self.currentUrlB += 1 url = self.listsB.pop(0) self.lineEditWebAddress.setText(url) url = url.strip() # self.labelProStatusB.setText( # '<font color="green" size=4><b>For grouping "B": <u> %s </u> total items in the batch, processing <u> %s </u> out of <u> %s </u></b></font>' % ( # str( # self.totalUrlB), str(self.currentUrlB), str(self.totalUrlB))) elapsedTime = time.clock() - self.startTime print elapsedTime self.labelProStatusB.setText( '<font size=4><b>URL <u> %s </u> of <u> %s </u> being processed. 
Time elapsed: %s</b></font>' % ( str(self.currentUrlB), str(self.totalUrlB), str(time.strftime('%H:%M:%S', time.gmtime(elapsedTime))))) pdfFile = str(url).split('/')[-1] print 'pdf file : ' + pdfFile # pdfFile = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_. ]*)$', url) pdfFile = self.regex.getSearchedData('(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', pdfFile) pdfFiles = [f for f in os.listdir(self.fileDirB) if f.endswith('.pdf')] # self.pdfCounterB = int(self.regex.getSearchedData('(?i)^(\d+)_', pdfFiles[-1])) finalPdfFile = '' i = 2 for file in pdfFiles: if self.regex.isFoundPattern('(?i)' + pdfFile, file): finalPdfFile = str(self.pdfCounterB) + '_' + str(pdfFile) + '_copy_' + str(i) + '.pdf' i += 1 if len(finalPdfFile) is 0: finalPdfFile = str(self.pdfCounterB) + '_' + pdfFile + '.pdf' self.webToPdfB.printWebHtmlToPdf(url, self.fileDirB + '/', finalPdfFile, 'B') self.labelProStatusB.setText( '<font size=4><b>URL <u> %s </u> of <u> %s </u> being processed. Time elapsed: %s</b></font>' % ( str(self.currentUrlB), str(self.totalUrlB), str(time.strftime('%H:%M:%S', time.gmtime(elapsedTime))))) self.pdfCounterB += 1 else: self.showStatus('Pdf Generation Completed') self.alreadyClickedB = False self.totalUrlB = 0 self.currentUrlB = 0 self.fileDirB = None self.fileNameB = None # self.labelProStatusB.setText('') def urlListSelected(self): if self.typeName == 'A': self.fileName = QtGui.QFileDialog.getOpenFileName(self, "Select Text File", QDir.homePath() + "/Desktop") if self.typeName == 'B': self.fileNameB = QtGui.QFileDialog.getOpenFileName(self, "Select Text File", QDir.homePath() + "/Desktop") self.labelSelectedUrl.setText('<b>%s</b>' % str(self.fileNameB)) def pdfPathSelected(self): if self.typeName == 'A': self.fileDir = QtGui.QFileDialog.getExistingDirectory(self, "Select Directory", QDir.homePath() + "/Desktop") self.pdfCounter = 1 if self.typeName == 'B': self.fileDirB = QtGui.QFileDialog.getExistingDirectory(self, "Select Directory", QDir.homePath() + "/Desktop") 
self.pdfCounterB = 1 self.labelSelectedPath.setText('<b>%s</b>' % str(self.fileDirB)) def onActivated(self, text): self.typeName = text self.pdfCounter = 1 def clearAll(self): self.lineEditWebAddress.clear() self.textBrowserStatus.clear() self.statusBar().showMessage('') self.pdfCounterB = 1 self.labelProStatusB.setText('') self.fileDirB = None self.fileNameB = None def groupingHelpAction(self): QMessageBox.information(None, 'Help Message', 'This program reads a text file of URLs and produces a series of PDFs. If the source text file contains more than one listing of the same URL, the program will create an extra copy of the PDF anyway in the output folder.') def appendStatus(self, data): self.textBrowserStatus.append(data) def showStatus(self, data): self.statusBar().showMessage(data)