Example #1
0
def main():
    """Collect a YouTube sample, parse it, and compute average views.

    The collected data, its length, and progress markers are printed to
    stdout. Nothing is returned.
    """
    # 100 is handed to the collector -- presumably the sample size; confirm
    # against youtube.grabYouTubeSample.
    stats = youtube.grabYouTubeSample(100)

    # Single-argument print(...) calls emit the same text on Python 2 and
    # Python 3, whereas the bare print statement is a SyntaxError on Python 3.
    print(len(stats))
    print(stats)
    print("FINISHED COLLECTION")

    print("STARTING PARSING")
    TitleParser.parse_videos(stats)

    print("COMPUTING AVERAGES")
    TitleParser.compute_average_views()

    print("COMPLETE")
Example #2
0
def main():
    """Collect a YouTube sample, parse it, and compute average views.

    The collected data, its length, and progress markers are printed to
    stdout. Nothing is returned.
    """
    # 100 is handed to the collector -- presumably the sample size; confirm
    # against youtube.grabYouTubeSample.
    stats = youtube.grabYouTubeSample(100)

    # Single-argument print(...) calls emit the same text on Python 2 and
    # Python 3, whereas the bare print statement is a SyntaxError on Python 3.
    print(len(stats))
    print(stats)
    print("FINISHED COLLECTION")

    print("STARTING PARSING")
    TitleParser.parse_videos(stats)

    print("COMPUTING AVERAGES")
    TitleParser.compute_average_views()

    print("COMPLETE")
Example #3
0
def process_stream(submission, reddit):
        """Send an alert for every known manga named in a 'DISC' submission.

        The submission title is upper-cased once. Titles that do not contain
        'DISC' are ignored. Otherwise the subscriber lookup result and the
        title are printed, and one alert is sent (and logged to stdout) per
        entry of ``db.all_manga()`` that occurs as a substring of the title.
        """
        upper_title = submission.title.upper()
        if 'DISC' not in upper_title:
            return

        print(db.find_subscribers(tp.title(upper_title)))
        print(upper_title)

        # Comparison is against the upper-cased title, so entries from
        # db.all_manga() only match if they are upper-case themselves.
        matching = (name for name in db.all_manga() if name in upper_title)
        for _ in matching:
            send_alert(submission, reddit.redditor(config.REDDITOR))
            print("alert {}".format(upper_title))
    def ScrapeLinkset(self):
        """Visit every stored link and save each qualifying image with metadata.

        For each link the page is loaded in the browser, the image element is
        located, and a high-resolution URL is derived from its ``src``. If
        the image passes the minimum-size filter, the page's title, source
        link and caption are looked up (each falling back to ``'N/A'``), the
        image is downloaded, and on success a metadata entry and a CSV row
        are written. Progress is printed to stdout; nothing is returned.

        A timeout while waiting for the image element itself is not handled
        here and propagates to the caller.
        """
        totalLinks = len(self.__links)
        # Numbers the next file name; only advances after a successful
        # download, so saved images are numbered without gaps.
        successCount = 1

        for loopCount, link in enumerate(self.__links, start=1):
            self._browser.get(link)
            imageName = self.__keyword.replace(
                " ", "_") + '_%d.jpg' % (successCount)
            print('(%d/%d): ' % (loopCount, totalLinks) + link)

            # Getting image download links
            image = self.__wait.until(
                EC.presence_of_element_located((
                    By.CSS_SELECTOR,
                    "div[class='Pj7 sLG XiG eEj m1e'] > div[class='XiG zI7 iyn Hsu'] > img"
                )))

            try:
                print('Initial request: ' + image.get_attribute('src'))
                imageLink = self.__GetHighResImage(image.get_attribute('src'))
                print('Final Request: ' + imageLink)

                if (ImageFilter.IsImageGreaterThanBounds(
                        imageLink, self.__horizontalMin, self.__verticalMin)):
                    # Get title
                    try:
                        title = self.__wait.until(
                            EC.presence_of_element_located(
                                (By.CSS_SELECTOR,
                                 "h1[class='lH1 dyH iFc ky3 pBj DrD IZT']")))
                        titleContent = title.text
                        doesTitleExist = True
                    except (TimeoutException):
                        doesTitleExist = False
                        titleContent = 'N/A'
                    print('\nTitle content:\n\n' + titleContent)

                    # Get source
                    try:
                        source = self.__wait.until(
                            EC.presence_of_element_located((
                                By.CSS_SELECTOR,
                                "div[class='Jea jzS zI7 iyn Hsu'] a[class='linkModuleActionButton']"
                            )))
                        srcContent = source.get_attribute('href')
                    except Exception:
                        # Best effort: any lookup failure means "no source",
                        # but KeyboardInterrupt/SystemExit still propagate.
                        srcContent = 'N/A'
                    print('\nSource content:\n\n' + srcContent)

                    # Get caption
                    print('\nCaption content:\n')
                    try:
                        caption = self.__wait.until(
                            EC.presence_of_element_located((
                                By.CSS_SELECTOR,
                                "span[class='tBJ dyH iFc MF7 pBj DrD IZT swG']"
                            )))
                        captionContent = caption.text
                    except (TimeoutException):
                        captionContent = 'N/A'

                    print(captionContent)

                    # No on-page title but a source link exists: try to get
                    # a title from the source instead. doesTitleExist stays
                    # False, so the CSV row below still uses the caption.
                    if (titleContent == 'N/A' and srcContent != 'N/A'):
                        try:
                            titleContent = TitleParser.GetTitle(srcContent)
                        except Exception:
                            titleContent = 'N/A'

                    # Write image to directory
                    imageSuccess = self.__DownloadImage(imageLink, imageName)
                    # Write caption to captions.txt in directory
                    if (imageSuccess):
                        successCount += 1
                        captionSuccess = self.__WriteToMetadataFile(
                            imageName, titleContent, srcContent,
                            captionContent)
                        if (captionSuccess):
                            # The CSV label is the first 20 characters of
                            # the title, or of the caption when the page
                            # had no title element.
                            if (not doesTitleExist):
                                self.__WriteToCSVFile(imageName,
                                                      captionContent[0:20],
                                                      link)
                            else:
                                self.__WriteToCSVFile(imageName,
                                                      titleContent[0:20], link)
                else:
                    print('Image not greater than bounds: ' + imageLink)
            except Exception:
                # Reached when the src attribute is missing (string + None
                # raises TypeError) and for any other failure while
                # processing this link; the loop moves on to the next link.
                # Narrowed from a bare except so Ctrl-C can stop the scrape.
                print('No image found (src = NULL)')

            print()
            print()
Example #5
0
                row['viewCount'] = '1,000,000+'
            data.append(row)

    with open("Fixed" + readFile, 'w', encoding='utf8',
              newline='') as output_file:
        keys = data[0].keys()
        writer = csv.DictWriter(output_file, keys)
        writer.writeheader()
        writer.writerows(data)


def fixNullByte(readFile='YouTubeData5.csv'):
    """Write a copy of *readFile* with every NUL character removed.

    The copy is created next to the input and named ``"NoNull"`` followed by
    the input's file name, e.g. ``YouTubeData5.csv`` becomes
    ``NoNullYouTubeData5.csv``. The input file is left untouched.

    Args:
        readFile: Path of the file to clean. Defaults to the data file the
            function was originally hard-coded to.
    """
    import os

    # Prefix only the file name so a path with a directory part still
    # resolves to a valid sibling file.
    directory, fileName = os.path.split(readFile)
    outPath = os.path.join(directory, "NoNull" + fileName)

    with open(readFile) as fd:
        data = fd.read()

    with open(outPath, 'w') as fo:
        fo.write(data.replace('\x00', ''))


# One-off CSV repair helpers, left disabled; uncomment to run them.
#fixNullByte()
#fixViewCount()
# Module-level driver: runs whenever this file is executed or imported.
# Grab a sample, feed it to TitleParser, then compute the averages.
s = grabYouTubeSample()
TitleParser.parse_videos(s)
TitleParser.compute_average_views()
# Same computation for the description and tag dictionaries; the second
# argument is presumably the output CSV path -- confirm in TitleParser.
TitleParser.gen_compute_average_views(TitleParser.description_dict,
                                      'DescriptionData.csv')
TitleParser.gen_compute_average_views(TitleParser.tag_dict, 'TagData.csv')
Example #6
0
                row['viewCount'] = '10,001-100,000'
            elif row['viewCount'] <= 1000000:
                row['viewCount'] = '100,001-1,000,000'
            else:
                row['viewCount'] = '1,000,000+'
            data.append(row)

    with open("Fixed"+readFile,'w',encoding='utf8',newline='') as output_file:
        keys = data[0].keys()
        writer = csv.DictWriter(output_file, keys)
        writer.writeheader()
        writer.writerows(data)

def fixNullByte(readFile='YouTubeData5.csv'):
    """Write a copy of *readFile* with every NUL character removed.

    The copy is created next to the input and named ``"NoNull"`` followed by
    the input's file name, e.g. ``YouTubeData5.csv`` becomes
    ``NoNullYouTubeData5.csv``. The input file is left untouched.

    Args:
        readFile: Path of the file to clean. Defaults to the data file the
            function was originally hard-coded to.
    """
    import os

    # Prefix only the file name so a path with a directory part still
    # resolves to a valid sibling file.
    directory, fileName = os.path.split(readFile)
    outPath = os.path.join(directory, "NoNull" + fileName)

    with open(readFile) as fd:
        data = fd.read()

    with open(outPath, 'w') as fo:
        fo.write(data.replace('\x00', ''))

# One-off CSV repair helpers, left disabled; uncomment to run them.
#fixNullByte()
#fixViewCount()
# Module-level driver: runs whenever this file is executed or imported.
# Grab a sample, feed it to TitleParser, then compute the averages.
s = grabYouTubeSample()
TitleParser.parse_videos(s)
TitleParser.compute_average_views()
# Same computation for the description and tag dictionaries; the second
# argument is presumably the output CSV path -- confirm in TitleParser.
TitleParser.gen_compute_average_views(TitleParser.description_dict, 'DescriptionData.csv')
TitleParser.gen_compute_average_views(TitleParser.tag_dict, 'TagData.csv')