Code Example #1
File: main.py Project: tmoi29/AliceCLS
def get_text_links():
    for entry in links:
        arr = links[entry]
        if "//" not in arr[0]:
            i = 0
            while i + 2 < len(arr):
                try:
                    title = scraper.scrape(arr[i + 2],
                                           arr[1])[0].strip().encode(
                                               'ascii', 'ignore') + " " + entry
                except:
                    print arr
                    print i + 2

                data.loc[title] = ["", 0]
                data.loc[title]["Link"] = arr[i + 2]
                txts = scraper.scrape(arr[i + 2], arr[0])

                for txt in txts:
                    words = txt.split()
                    for w in words:
                        w = ''.join(c for c in w if c not in punctuation)
                        if w.lower() in keywords:
                            data.loc[title]["Score"] += 1
                i += 1
        else:
            print entry
Code Example #2
File: main.py Project: tmoi29/AliceCLS
def get_text_files():
    paths = glob.glob('files/*/*')
    for f in paths:
        source = f.split('/')[1]
        title = scraper.scrape_file(f, links[source][1])[0].strip().encode(
            'ascii', 'ignore') + ' ' + source
        data.loc[title] = ["", 0]
        for s in ["lexology", "natlawreview"]:
            if source == s:
                for link in links[s]:
                    if "//" in link:
                        t = scraper.scrape(link,
                                           links[s][1])[0].strip().encode(
                                               'ascii', 'ignore')
                        if t[:10].lower() in title.lower():
                            data.loc[title]['Link'] = link
                            break
        if source == "law360":
            for link in links["law360"]:
                if "//" in link:
                    t = scraper.scrape(link, "h1")[0].strip().encode(
                        'ascii', 'ignore')
                    if t[:10].lower() in title.lower():
                        data.loc[title]['Link'] = link
                        break

        txts = scraper.scrape_file(f, links[source][0])
        for txt in txts:
            words = txt.split()
            for w in words:
                w = ''.join(c for c in w if c not in punctuation)
                if w.lower() in keywords:
                    data.loc[title]["Score"] += 1
Code Example #3
def scrape_nnm():
	settings = player.load_settings()
	data_path = settings.torrents_path()

	hashes = []
	for torr in filesystem.listdir(filesystem.join(data_path, 'nnmclub')):
		if torr.endswith('.torrent'):
			try:
				from base import TorrentPlayer
				tp = TorrentPlayer()
				tp.AddTorrent(filesystem.join(data_path, 'nnmclub', torr))
				data = tp.GetLastTorrentData()
				if data:
					hashes.append((data['announce'], data['info_hash'], torr.replace('.torrent', '.stat')))
			except BaseException as e:
				log.print_tb(e)

	for chunk in chunks(hashes, 32):
		import scraper
		try:
			seeds_peers = scraper.scrape(chunk[0][0], [i[1] for i in chunk])
		except RuntimeError as RunE:
			if '414 status code returned' in RunE.message:
				for c in chunks(chunk, 16):
					try:
						seeds_peers = scraper.scrape(c[0][0], [i[1] for i in c])
						process_chunk(c, data_path, seeds_peers)
					except BaseException as e:
						log.print_tb(e)
			continue
		except BaseException as e:
			log.print_tb(e)
			continue

		process_chunk(chunk, data_path, seeds_peers)
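The `chunks` helper called above is not part of the snippet. A minimal sketch of what it is assumed to do (yield fixed-size slices of a sequence) could look like this:

# Assumed helper, not taken from the project above: yields successive
# slices of at most n items from seq.
def chunks(seq, n):
    for i in range(0, len(seq), n):
        yield seq[i:i + n]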
Code Example #4
def main():
    parser = argparse.ArgumentParser(description='Bulk Saving Of Job Posts')
    parser.add_argument('-q', help='Search Term Query')
    parser.add_argument('-l', help='Location')
    parser.add_argument('-o', help='Output location')
    parser.add_argument('--json', help='Export Json File')
    parser.add_argument('--xlsx', help='Export Excel File')

    args = parser.parse_args()

    # print(args.accumulate(args.integers))
    if args.l == None:
        print "Must include location with: -l 'location'"
        exit()
    if args.q == None:
        print "Must include search term with: -q 'job title'"
        exit()

    if (args.q != None and args.l != None):
        print 'make this stuff happen'
        output = ''
        if (args.o == None):
            fileName = args.l.replace(' ', '-').replace(
                ',', '') + '-' + args.q.replace(' ', '-')
            output = os.getcwd() + '/' + fileName + '.xlsx'
        else:
            output = args.o
        scraper.scrape({
            'location': args.l,
            'search': args.q,
            'output': output
        })
Code Example #5
def concertScrape(limitOfNew):
    key = 'Dv4TzTBMtO5GJ57Dcrf0Jbxst8fEQHLx'
    secret = 'lgAN5MVcjmUM30rB'
    url = 'https://app.ticketmaster.com/discovery/v2/events.json?size=' + str(limitOfNew) + '&classificationName=concert&apikey=Dv4TzTBMtO5GJ57Dcrf0Jbxst8fEQHLx'
    
    contents = urllib2.urlopen(url).read()
    data = json.loads(contents)
    
    for i in range(len(data["_embedded"]["events"])):
        try:
            event = data["_embedded"]["events"][i]
            
            # Get all necessary info from json
            creator = event["promoter"]["name"]
            date = event["dates"]["start"]["localDate"]
            name = event["name"]
            url = event["images"][0]["url"]
            desc = event["url"]
            
            # Change from unicode str
            creator = scraper.uni_to_str(creator)
            date = scraper.uni_to_str(date)
            name = scraper.uni_to_str(name)
            url= scraper.uni_to_str(url)
            desc = scraper.uni_to_str(desc)
            
            print(creator, date, name, url, desc)
                    
            scraper.scrape(creator, name, desc, date, "concert", url)
        except:
            continue
Code Example #6
def main():
    creds = config.get_creds()
    sftp.download(creds.get("sftp_url"), creds.get("sftp_username"),
                  creds.get("sftp_password"), creds.get("localpath"))
    cleaner.clean(creds.get("localpath"))
    merge.merge(creds.get("localpath"))
    scraper.scrape(creds)
Code Example #7
def main():
  if len(sys.argv) <= 1:
    printUsageAndExit()

  animes = sys.argv[1:]
  anime_urls = [utils.getPageUrl(anime) for anime in animes]
  s.scrape(anime_urls)
Code Example #8
def predict(pages, team1, team2, m):
    #make sure we have data on these teams
    if not(isTop20(team1)):
        print("Team 1 is not a top 20 team")
        return
    if not(isTop20(team2)):
        print("Team 2 is not a top 20 team")
        return 
    if not(isMap(m)):
        print("The map is not in our pool")
        return
    if pages>0:
        print("Scraping process will take some time, please be patient")
        print("**********Scraping Map Results now**********")
        scrape(pages)
        print("**********Scraping Map Stats now**********")
        scrape_map_stats()
        print("**********Filtering Data now**********")
        filterCSV()
        print("**********Generating Data now**********")
    #Make sure we have a "filtered_top20.csv" file to examine (in case the user doesn't scrape)
    if os.path.isfile('filtered_top20.csv'):
        data = getDataReady()
        tree = build_tree_id3(data)
        #pp = pprint.PrettyPrinter(indent=4)
        #pp.pprint(tree)
        boolean = {True : team1, False : team2}
        print("{} would win.".format(boolean[classify(tree,userInputStats(team1, team2, m))]))  
    else:
        print('\"filtered_top20.csv\" was not found. Please scrape for data before attempting to predict')
Code Example #9
def test_scrape_raises_http_error():
    with mock.patch('scraper.urlopen') as urlopen_mock:
        urlopen_mock.side_effect = HTTPError('http://example.org', 404,
                                             'Not found', {}, mock.Mock())
        with pytest.raises(HTTPError) as exc:
            scraper.scrape('http://example.org')
        assert exc.value.code == 404
        assert exc.value.msg == 'Not found'
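The `scraper` module exercised by this test is not shown on this page. A minimal sketch that would satisfy the test (module layout and return value are assumptions) might be:

# Hypothetical scraper.py: urlopen must be a module-level name so that
# mock.patch('scraper.urlopen') can replace it, and HTTPError is deliberately
# not caught so it propagates to the caller, as the test expects.
from urllib.request import urlopen

def scrape(url):
    with urlopen(url) as response:
        return response.read()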
Code Example #10
def lambda_handler(event, context):

    # Run scraping function
    scrape()

    # Query Spotify API, write into JSON file and upload it to S3
    query_spotify_api()

    print("The function ran successfully.")
Code Example #11
File: main.py Project: sudiptog81/lhd-covid-scraper
def home():
    try:
        scraper.scrape()
        return 'Ran Successfully'
    except Exception as e:
        if (app.config['DEBUG'] == True):
            return str(e)
        else:
            return 'Error Encountered'
Code Example #12
File: app.py Project: jdm79/headline-api
def print_headlines():
  # clear the list so we only get the latest headlines
  headlines.clear()

  # run this function over each paper
  for url in urls.values():
    scrape(url)
    
  # the function returns the final list
  return response
Code Example #13
File: profilefetcher.py Project: mr4jay/LinkedIngine
def fetchProfiles(initURL, maxcount):
    """Given the URL from where to initiate the crawling, it first fetches the webpage, sends it to
    the crawler for scraping data from the webpage. Not only that, it also reads all the public profile
    urls present in the current page and adds them to the list. In subsequent iterations, it will fetch
    the LinkedIn profiles of people associated with these urls. The iteration continues for the number of
    times specified by maxcount"""
    count = 0
    links = set([initURL])
    waitinglist = list()

    start = datetime.now()

    while count < maxcount:
        count += 1

        while len(links) > 0:
            newreq = links.pop()
            if newreq not in waitinglist:  # If the url hasn't been used already, add it to the waiting list
                waitinglist.append(newreq)
                break

        try:
            page = urllib2.urlopen(waitinglist[-1]).read(
            )  # Fetch the web page from the url just appended
            scraper.scrape(
                page,
                waitinglist[-1])  # Send the page and the url for scraping

            if len(links) < 3:
                links.update(profileURL.findall(
                    page))  # Get all the urls present in this web page
        except:
            pass

        links = set([link.strip('"')
                     for link in links])  # String processing to remove quotes

        percentage = int(count * 100.0 / maxcount)  # Progress bar
        sys.stdout.write('\r' + '=' * percentage + '>' + ' ' *
                         (101 - percentage) + str(percentage) + '%')
        sys.stdout.flush()

    print 'Fetched', count, 'profiles in', \
     (datetime.now() - start).total_seconds(), 'seconds'

    start = datetime.now()
    classifier.classify(
    )  # Classify all profiles in the database [TODO: classify only updated portion of db]
    print 'Classified all profiles in database in', \
     (datetime.now() - start).total_seconds(), 'seconds'

    indexer.computeIndexes(
    )  # Compute indexes for every profile in the database [TODO: same as above]
    print 'Calculated indexes for all profiles in database in', \
     (datetime.now() - start).total_seconds(), 'seconds'
Code Example #14
File: main.py Project: KC2004/bidwire
def main():
    log.info("Starting Bidwire run")
    start = time.time()

    scraper.scrape()
    log.info("Scraping complete. Sending notifications.")
    new_bids = notifier.send_new_bids_notifications(EMAIL_RECIPIENTS)
    elapsed_secs = time.time() - start

    log.info("Notification sending complete. Sending debug email.")
    DebugEmail().send(new_bids, EMAIL_RECIPIENTS, elapsed_secs)
Code Example #15
def main():
    log.info("Starting Bidwire run")
    start = time.time()

    scraper.scrape(SITE_CONFIG)
    log.info("Scraping complete. Sending notifications.")
    new_bids = notifier.send_new_notifications(SITE_CONFIG)
    elapsed_secs = time.time() - start

    log.info("Notification sending complete. Sending debug email.")
    DebugEmail().send(new_bids, SITE_CONFIG, elapsed_secs)
Code Example #16
File: companies.py Project: nickwu241/glass-heaven
def scrape_companies_data(
    company_names: List[str],
    use_cache: bool = False,
    n: int = 2147483647,
    skip_companies: Set[str] = set()
) -> Tuple[List[Company], List[FailedCompanyError]]:
    errors = []
    output_data = []

    for i, company_name in enumerate(company_names):
        if i >= n:
            break

        if company_name in skip_companies:
            print(f'[INFO] Skip scraping {company_name}')
            continue

        try:
            company_id = company_name.replace(' ', '_').lower()
            company = Company(id=company_id)
            overview_url, reviews_url = scraper.get_glassdoor_urls(
                company_name)
            print('[INFO]', company_name, overview_url, reviews_url)
            if overview_url is None or reviews_url is None:
                raise Exception(
                    f'Cannot find both URLs for "{company_name}": {overview_url} {reviews_url}'
                )

            reviews_data = scraper.scrape(reviews_url,
                                          f'{company_name}_reviews.html',
                                          scraper.get_reviews_data)
            overview_data = scraper.scrape(overview_url,
                                           f'{company_name}_overview.html',
                                           scraper.get_overview_data)
            data = {
                'name': company_name,
                'overview_url': overview_url,
                'reviews_url': reviews_url,
                'linkedin_url': scraper.get_linkedin_url(company_name),
            }
            data.update(reviews_data)
            data.update(overview_data)
            company.update_data(data)
            output_data.append(company)
        except Exception as e:
            print(f'[FAIL] caught exception when parsing "{company_name}"')
            errors.append(
                FailedCompanyError(
                    company_name=company_name,
                    exception=e,
                ))

    return output_data, errors
Code Example #17
 def get(self, college, year, branch, low, high, semc):
     scraper.scrape(college, year, branch, low, high, semc)
     filename = 'ExcelFiles/' + '1' + college + year + branch + low + '-' + high
     extension = '.xls'
     zipf = zipfile.ZipFile('Results-Excel.zip', 'w', zipfile.ZIP_DEFLATED)
     files = [
         filename + extension, filename + 'GPA' + extension,
         filename + 'RANK' + extension
     ]
     for file in files:
         zipf.write(file)
     zipf.close()
     return send_from_directory('', 'Results-Excel.zip')
Code Example #18
def start(parameters):

    try:
        db_connection = connector.connect(user='******',
                                          password='******',
                                          host='localhost',
                                          database='EAGLEEYE')

        scraper.scrape(parameters, db_connection)
    except:

        time.sleep(5)
        print("Database down, trying to connect...")
        start(parameters)
Code Example #19
 def test_scrape(self, source):
     test_limit = 3
     web_df = scrape(
         source=source,
         limit=test_limit,
         test=True,
         since=str(datetime.datetime.now().date() - datetime.timedelta(7)),
     )
     self.assertEqual(len(web_df), test_limit)
     web_df = scrape(source=source,
                     limit=test_limit,
                     test=True,
                     since="2019-09-17")
     self.assertEqual(len(web_df), test_limit)
Code Example #20
File: model.py Project: declankirk/breadtubebot2
def update_model():
    print("Scraping posts...")
    scrape(500000)

    print("Building model...")
    with open('titles.txt', encoding='utf8') as f:
        titles = f.read()
    model = markovify.NewlineText(titles)

    print("Exporting model...")
    model_json = model.to_json()
    with open('model.json', 'w') as f:
        f.write(model_json)
    
    print("Done!")
Code Example #21
def read_majors(
        game_id=int(db_game), year=int(db_year), base=None, current=False):
    set_readin_args(args)
    #slugs = ["genesis-5","summit6","shine2018","tbh8","summit7"]
    fails = []
    scrape_load = False
    slug_given = False
    if db_slug == None:
        if to_load_slugs:
            scrape_load = True
            if v >= 3 and year == int(db_year):
                print('Loading saved slugs...')
            slugs = load_slugs(game_id, year)
            if slugs == False or slugs == []:
                if v >= 3:
                    print('Saved slugs not found.')
                slugs = scraper.scrape(game_id, year, v)
                scrape_load = False
        else:
            slugs = scraper.scrape(game_id, year, v)
        fails = [event[1] for event in slugs if type(event) is tuple]
        slugs = [event for event in slugs if type(event) is str]
    elif type(db_slug) is list:
        slugs = db_slug
        slug_given = True
    else:
        #print(type(db_slug))
        slugs = [db_slug]
        slug_given = True
    if v >= 3 and not scrape_load and not slug_given:
        if len(slugs) <= 0:
            print('No slugs found for game %d in year %d:' % (game_id, year))
        else:
            print('Scraped the following slugs for game %d in year %d:' %
                  (game_id, year))
            print(slugs)
    if not fails == [] and v > 0:
        print(
            'The following majors could not be read (no smash.gg bracket found)'
        )
        print(fails)
    if to_save_db and not scrape_load and not slug_given:
        save_slugs(slugs, game_id, year, to_save_db=to_save_db)
    return (read_tourneys(slugs,
                          ver=game_id,
                          year=year,
                          base=base,
                          current=current))
Code Example #22
File: endpoints.py Project: Sciguystfm/Dining-Data
def handle_scrape():
    username = request.authorization.username
    password = request.authorization.password
    sdate = request.args.get("sdate")
    edate = request.args.get("edate")
    try:
        scrape(username, password, sdate, edate)
    except NoSuchElementException as e:
        return jsonify({
            "status": "failure"
        }),400


    return jsonify({
        "status": "success"
    }), 200
Code Example #23
File: renamer.py Project: wtcurtis/TVFilter
def renameAll(rootDir, metadataFile):
	try:
		season = os.path.basename(rootDir)
		metadata = scraper.scrape(open(metadataFile))
	#	print metadata
	#	sys.exit()
	
	except ValueError:
		print "Couldn't parse the season from the given."
		return 0
	
	files = os.listdir(rootDir)

	parsedFiles = []
	pattern = re.compile(r"([\w ]*) - (\d\d)x(\d\d)(.*)")

	for f in files:
		matches = pattern.search(f).groups()
		orgFile = os.path.join(rootDir, f)
		show = matches[0]
		season = int(matches[1])
		episode = int(matches[2])

		newName = '{0} - {1:0>2}x{2:0>2} - {3}{4}'.format(show, season, episode, metadata[season][episode], matches[3])
		
		result = raw_input('Moving {0} to {1}. Continue? (y/n)'.format(f, newName))
		if(result == 'y'):
			os.rename(orgFile, os.path.join(rootDir, newName))
		parsedFiles.append({'file': orgFile, 'newPath': os.path.join(rootDir, newName)})
	
	return parsedFiles
Code Example #24
File: default.py Project: tuffnerdstuff/xbmc-plugins
def buildVideoIndex(url):
    data = scraper.scrape(url)
    nextLinkUrl = scraper.scrapeNextPageLink(url)
    for name, info_url, img, date in data:
        addLink(name, info_url, 3, img)
    if (nextLinkUrl != None):
        addDir("[Next Page >>]", nextLinkUrl, 2, '')
Code Example #25
def search_scene():
    global search, running, show_books
    while search:
        # This is the search scene: an input box where you type in the name of the book
        screen.fill(GRAY)
        search_box.update()
        search_box.draw(screen)
        pygame.display.flip()
        for ev in pygame.event.get():
            if ev.type == pygame.QUIT:
                # Exiting the search scene here should shut the whole thing down, but it doesn't quite do that
                running = False
                search = False
            # From the class: checks whether you clicked on s_box, pressed enter or backspace, or typed text
            search_box.handle_event(ev)
            if search_box.enter:
                search_book = search_box.rText
                show_books = True
                search = False
                for book_dict in scraper.scrape(search_book):
                    # Puts the results from the scraper into the arrays
                    titles.append(book_dict["book"])
                    authors.append(book_dict["author"])
                    book_ids.append(book_dict["book_id"])
                    downloads.append(book_dict["downloads"])
    return images, titles
Code Example #26
def scraper():
    #delete_all_potential_locations()
    loopnetListings = scrape()
    j = 0
    name_set = {}
    for index, row in loopnetListings.iterrows():
        address_map = {}
        address = row[0]
        address_map['address'] = address
        components = address.split(",")
        try:
            address_map['street'] = components[0]
            address_map['city'] = components[1]
            address_map['state'] = components[2]
        except:
            print("Exception: invalid format of address")
            continue
        name = row[1]
        if name_set.get(name) == None:
            name_set[name] = 1
        else:
            name = name + " " + str(name_set.get(name))
        lat, lon = get_lat_long(address)
        try:
            store_scraped_in_google(address_map, name, lat, lon)
        except:
            print("Exception: Could not store in Google")
Code Example #27
 def test_english_detection(self):
     from translation import Translator
     my_translator = Translator(None)
     result = scraper.scrape("http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNFY1KzEAhaiZchzd5ulmoY4_4P8kA&url=http://vov.vn/Van-hoa/NSND-Thanh-Hoa-xuc-dong-hat-truoc-benh-nhan/228256.vov")
     self.assertFalse(result.get('unscrapable'))
     text_obj = process_resources.extract_clean_content(result['htmlContent'])
     self.assertFalse(my_translator.is_english(text_obj['content']))
Code Example #28
 def scrape_thread(cur2, pbar, count, qhashs, nth, total, ip="open.demonii.com"):
     db = MySQLdb.connect(**config.mysql)
     cur = db.cursor()
     last_commit = time.time()
     errno=0
     try:
         l = qhashs.get(timeout=0)
         while True:
             try:
                 for hash, info in scraper.scrape("udp://%s:1337/announce" % ip, l).items():
                     cur.execute("UPDATE torrents SET scrape_date=NOW(), seeders=%s, leechers=%s, downloads_count=%s WHERE hash=%s", (info['seeds'], info['peers'], info['complete'], hash))
                 if time.time() - last_commit > 30:
                     db.commit()
                     last_commit = time.time()
                 pbar.update(min(pbar.currval + len(l), count))
                 l = qhashs.get(timeout=0)
                 errno=0
             except (socket.timeout, socket.gaierror, socket.error):
                 db.commit()
                 time.sleep(0.1 * errno + 0.1)
                 errno+=1
                 if errno > 10:
                     raise
     except queue.Empty:
         pass
     finally:
         db.commit()
         cur.close()
         db.close()
Code Example #29
File: app.py Project: wi11/Intern.find
def display_tables():
    max_results_per_state = 10
    state_set = ["California", "New York", "Washington", "Illinois", "Texas"]
    data = {"Job Title":[], "Company":[], "Location":[]}
    dataframe = scraper.scrape(max_results_per_state, state_set, data)
    print "HI"
    return render_template("table.html", dataframe=dataframe.to_html())
Code Example #30
def get_torrent_stats(url):
    response = requests.get(url)
    data = bencodepy.decode(response.content)
    files = data[b'info'][b'files']
    size = 0
    for file in files:
        size += file[b'length']
    size = size * 1e-9
    size = round(size, 2)
    info_hash = hashlib.sha1(bencodepy.bencode(data[b"info"])).hexdigest()

    trackers_list = data[b'announce-list']

    stats = {'seeds': 0, 'peers': 0}

    for tracker_url in trackers_list:
        tracker_url = tracker_url[0].decode('utf-8')
        result = scrape(tracker_url, [info_hash])
        if not result:
            continue
        if result[info_hash]['seeds'] is None or result[info_hash][
                'peers'] is None:
            continue
        stats['seeds'] = max(stats['seeds'], result[info_hash]['seeds'])
        stats['peers'] = max(stats['peers'], result[info_hash]['peers'])
    stats['size_gb'] = size
    return stats
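A possible call of the function above; the .torrent URL is a placeholder, not taken from any of the projects on this page:

# Hypothetical usage of get_torrent_stats(); URL and output are illustrative only.
stats = get_torrent_stats('https://example.org/files/sample.torrent')
print(stats)  # e.g. {'seeds': 12, 'peers': 3, 'size_gb': 1.4}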
Code Example #31
File: app.py Project: aphrx/jabber
def scheduled():

    # print out time that cron job was deployed in console
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("Cron Job Executed at " + dt_string)

    # Find users who signed up for cron job
    users = mongo.db.users.find({'cron': {"$exists": True}})

    # For each user who has signed up for the cron job, find jobs they are interested in
    for u in users:
        test = scraper.scrape()
        jobs = []
        employer = []
        links = []
        jobs, employer, links, count = test.search(u['cron']['cron_job'],
                                                   u['cron']['cron_loc'], True)
        j = jobbankapply.apply(links)
        emails, jobs, employer = j.run()

        # find user's cv and resume
        cv_data = u['cv']
        cv_data = cv_data.encode('latin-1', 'replace').decode('latin-1')
        resume = u['resume']
        resume = resume.encode('latin-1', 'replace').decode('latin-1')

        # apply for jobs on behalf of user
        j.email(emails, jobs, employer, cv_data, resume, u['id'], u['email'])
Code Example #32
File: web2epub.py Project: gauravmm/Web2ePub
def run(args):
	styler = getStyle(args.style[0]);
	if not styler:
		raise RuntimeError("Cannot find output style " + args.style + ".");
	print "Loaded Style: \t" + styler.name + ".";

	print "Scraping Website";
	web = scrape(args.url, styler, cache=(not args.no_cache));

	if len(web[1]) == 0:
		raise RuntimeError("No valid pages found!");

	web = (styler.edit_book_metadata(web[0]), web[1], web[2]);
	
	fout = title_fn(web[0]["author"] + " - " + web[0]["title"]);
	if args.out and len(args.out) > 0 and len(args.out[0]) > 0:
		# If args.out is a directory, then we use our custom filename, otherwise
		# we use the given filename.
		if args.out[0][-1] == "/":
			fout = args.out[0] + fout;
		else:
			fout = args.out[0];

		if not fout.lower().endswith(".epub"):
			fout = fout + ".epub";
	
	if args.no_overwrite and path.exists(fout):
		print "Skipped! \"" + fout + "\" exists.";
	else:
		print "Building ePub";
		epub(web, fout, styler, args);
		
		print "Done! Written output to \"" + fout + "\"";
Code Example #33
File: ml.py Project: eoershova/vk-sentiment
def async_handler():
    conn = sqlite3.connect('test2.db', check_same_thread=False)
    c = conn.cursor()
    while True:
        c.execute(
            "SELECT token, period, request_id FROM test WHERE score = 'unready'"
        )
        request = c.fetchone()

        if request is not None:
            print('ПРИНИМАЮСЬ')
            request_id = request[2]
            print(type(request_id))
            date = request[1]
            token = request[0]
            dates = period(date.split(" ")[0], date.split(" ")[1])
            posts = scrape(dates, token)
            comments = get_comments(token, posts, dates, request_id)
            result = predict(comments)
            predictions_df = result[1]
            c.execute("UPDATE test SET score = ? where request_id = ?",
                      (result[0], request_id))
            c.execute("UPDATE test SET comments = ? where request_id = ?",
                      (predictions_df, request_id))
            conn.commit()
            print('УПРАВИЛСЯ')
            continue
        time.sleep(0.5)
Code Example #34
def mooving(ticker):

    """
This function retrieves the dataframe created by the scrape function.
It runs mooving_average on it to predict tomorrows prices.

    """
    df = scrape(ticker)
    df_cut = pd.DataFrame(
        df, columns=['date', 'closing', 'SMA', 'EMA_Short', 'EMA_Long'])
    df_cut.closing = np.around(df_cut.closing, decimals=2)

    window = round(len(df_cut)*0.2)

    #Simple Moving Average

    preds=[]
    for i in range(window):
        x = df_cut.closing[(len(df_cut) - 2*window + i):(len(df_cut) - window + 1)].sum() + sum(preds)
        x_mean = x/window
        preds.append(np.around(x_mean, decimals=2))
        df_cut['SMA'][len(df_cut) - window + i] = preds[i]

    rms_sma=np.sqrt(mean_squared_error(np.array(df_cut['closing'][(len(df_cut) - window) :]), np.array(df_cut['SMA'][(len(df_cut) - window) :])))

    #Tomorrow's predicted price
    num=0
    denom=0
    for j in range(window):
        num+= j*(df_cut.closing[len(df_cut) - window + j])
        denom+= j
    pred_weighted = num/denom

    print(df_cut)
    return pred_weighted
Code Example #35
def all():
    a = scrape()
    return jsonify({
        "head": "from covid-19-generator",
        "status": 200,
        "body": a
    })
Code Example #36
File: profilefetcher.py Project: hjfu/LinkedIngine
def fetchProfiles(initURL, maxcount):
    """Given the URL from where to initiate the crawling, it first fetches the webpage, sends it to
    the crawler for scraping data from the webpage. Not only that, it also reads all the public profile
    urls present in the current page and adds them to the list. In subsequent iterations, it will fetch
    the LinkedIn profiles of people associated with these urls. The iteration continues for the number of
    times specified by maxcount"""
    count = 0
    links = set([initURL])
    waitinglist = list()

    start = datetime.now()

    while count< maxcount:
        count += 1

        while len(links) > 0:
            newreq = links.pop()
            if newreq not in waitinglist:   # If the url hasn't been used already, add it to the waiting list
                waitinglist.append(newreq)
                break

        try:
            page = urllib2.urlopen(waitinglist[-1]).read() # Fetch the web page from the url just appended
            scraper.scrape(page, waitinglist[-1]) # Send the page and the url for scraping

            if len(links) < 3:
                links.update(profileURL.findall(page)) # Get all the urls present in this web page
        except:
            pass

        links = set([link.strip('"') for link in links]) # String processing to remove quotes

        percentage = int(count*100.0/maxcount)    # Progress bar
        sys.stdout.write('\r'+'='*percentage+'>'+' '*(101-percentage) +str(percentage)+'%')
        sys.stdout.flush()

    print 'Fetched', count, 'profiles in', \
     (datetime.now() - start).total_seconds(), 'seconds'

    start = datetime.now()
    classifier.classify() # Classify all profiles in the database [TODO: classify only updated portion of db]
    print 'Classified all profiles in database in', \
     (datetime.now() - start).total_seconds(), 'seconds'

    indexer.computeIndexes() # Compute indexes for every profile in the database [TODO: same as above]
    print 'Calculated indexes for all profiles in database in', \
     (datetime.now() - start).total_seconds(), 'seconds'
Code Example #37
File: nextride.py Project: DrashtiBhatt/NextRide
def mobile():
    stop = request.args.get("stop", 1, type=int)
    schedule = scrape(stop)
    if schedule:
        response = dict(meta=dict(status=200, message="OK"), data=schedule)
    else:
        abort(400)
    return render_template("m.html", path=response)
Code Example #38
File: nextride.py Project: DrashtiBhatt/NextRide
def api():
    stop = request.args.get("stop", 1, type=int)
    schedule = scrape(stop)
    if schedule:
        response = jsonify(meta=dict(status=200, message="OK"), data=schedule)
    else:
        abort(400)
    return response
Code Example #39
File: testscraper.py Project: narenaryan/Sticker
def second_scrape():
    fir_url_box=list(first_scrape())
    print "i am in second"
    sec_url_box=[]
    for ele in fir_url_box:
        print ele
        for item in scrape(ele,'//a[@class="img"]/@href'):
            yield item
Code Example #40
File: api.py Project: kshvmdn/cdn-university-api
def api(code):
    try:
        data = scrape(code)
    except:
        data = None

    if not data:
        abort(404, {'message': 'Couldn\'t retrieve data for program %s.' % code})

    return jsonify(meta=dict(status=200, message='OK'), data=data)
Code Example #41
File: bordertimes.py Project: kshvmdn/border-times
def single(port):
    try:
        wait_times = scrape(port)
    except KeyError:
        abort(404, {'message': 'Invalid `port` value.'})

    if not wait_times:
        abort(400)

    return jsonify(meta=dict(status=200, message='OK'), data=wait_times)
Code Example #42
File: feeds.py Project: code56/jats-scraper
def scrape(docs_dir, process=None, article_version=None):
    if docs_dir is not None:
        import scraper
        mod = __import__(__name__)
        res = scraper.scrape(mod, doc=docs_dir, article_version=article_version)
        if process:
            res = process(res)

        import json
        res = json.dumps(res, indent=4, ensure_ascii = False)
        return res.encode('utf8')
Code Example #43
File: main.py Project: Manuel87/textual-tools
def web_count(name, levels, out=None):
    folder = scrape(name, levels, out)
    file_list = folder_reader(folder)
    count = Counter()
    count.name = folder
    count.source = name
    if file_list == None:
        return
    else:
        for file in file_list:
            count.count(folder + "/" + file)
    if out == None:
        out = os.getcwd()
    writer(out + "/report.JSON", json_formulate(count))
Code Example #44
File: db.py Project: barrylavides/New-Movies
def insert_movie():
    url = 'http://www.imdb.com/movies-coming-soon'
    url_exist = mongo.db.movies.find_one({'url': url})
    action = ''

    if url_exist == None:
        movies = mongo.db.movies.insert_many(scraper.scrape(url))
        action = 'Scrape movies and add to database'
    else:
        pass
        # Get movies from database
        action = 'Get movies from database'

    return action
Code Example #45
File: app.py Project: Nyubis/kul-calendar
def lookup(coursestring):
    courses = coursestring.split("+")
    entries_by_day = {}
    for course in courses:
        # check whether the user provided a valid day, otherwise use today
        day_param = request.args.get('day')
        if day_param is not None and verify_date_param(day_param):
            coursedata = scraper.scrape(course, day_param)
        else:
            coursedata = scraper.scrape(course)
        # this data is from one particular course
        # we take the individual moments and put them in the dict, separated by day
        bucketadd(entries_by_day, "day", coursedata)

    # this will end up containing Weekday objects, which also contain the courses for that day
    weekdays_with_courses = [] 
    for date, entries in entries_by_day.items():
        weekdays_with_courses.append(Weekday(date, entries))

    # sort the Weekdays based on their weekindex, so monday comes first and sunday last
    sorted_data = sorted(weekdays_with_courses, key=lambda x: x.weekindex)
    print(sorted_data)
    return render_template("lookup.html", days=sorted_data)
Code Example #46
def scrape_now(fn):
	debug(fn)
	tp = TorrentPlayer()
	tp.AddTorrent(fn)
	data = tp.GetLastTorrentData()
	debug(str(data))
	if data:
		hashes = [data['info_hash']]
		import scraper

		res = scraper.scrape(data['announce'], hashes)
		debug(str(res))
		return res[data['info_hash']]
	else:
		return {}
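Several snippets on this page index the result of scraper.scrape(announce, hashes) as a dict keyed by info hash. Based on how examples #28, #30 and #47 read it, the assumed shape is roughly the following (hash and numbers are illustrative only):

# Assumed shape of a tracker scrape result, inferred from how the snippets index it.
example_result = {
    'aabbccddeeff00112233445566778899aabbccdd': {
        'seeds': 10,      # seeders
        'peers': 2,       # leechers
        'complete': 57,   # completed downloads
    },
}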
Code Example #47
File: feed.py Project: henrytrager/openbay-crawler
 def scrape_thread(cur2, pbar, count, qhashs, nth, total, ips=["open.demonii.com"]):
     db = MySQLdb.connect(**config.mysql)
     cur = db.cursor()
     last_commit = time.time()
     errno=0
     nip = len(ips)
     banip=collections.defaultdict(int)
     i = nth
     try:
         l = qhashs.get(timeout=0)
         while True:
             try:
                 ip = ips[i%len(ips)]
                 i+=1
                 for hash, info in scraper.scrape("udp://%s:1337/announce" % ip, l).items():
                     cur.execute("UPDATE torrents SET scrape_date=NOW(), seeders=%s, leechers=%s, downloads_count=%s WHERE hash=%s", (info['seeds'], info['peers'], info['complete'], hash))
                 if time.time() - last_commit > 30:
                     db.commit()
                     last_commit = time.time()
                 pbar.update(min(pbar.currval + len(l), count))
                 l = qhashs.get(timeout=0)
                 errno=0
             except (socket.timeout, socket.gaierror, socket.error):
                 db.commit()
                 banip[ip]+=1
                 if banip[ip]>3:
                     try:ips.remove(ip)
                     except ValueError:
                         pass
                 if not ips:
                     raise ValueError("all ips failed")
                 time.sleep(0.1 * errno + 0.1)
                 errno+=1
                 if errno > nip*3:
                     raise
     except (queue.Empty, ZeroDivisionError):
         pass
     except (socket.timeout, socket.gaierror, socket.error):
         qhashs.put(l)
     except (ValueError, RuntimeError) as e:
         print e
     finally:
         db.commit()
         cur.close()
         db.close()
Code Example #48
File: views.py Project: bgheneti/PythonPictoscrape
def createURL(request):
	latest_fanfic_list = FanFic.objects.all().order_by('-pub_date')[:11]

	if request.method == 'POST': 
		form = CreateURLForm(request.POST)
		if form.is_valid():
			new_fanfic = form.save(commit=False)
			#banti's code -- scraping from url
			d = scraper.scrape(form.cleaned_data['url'])
			new_fanfic.title = d['title']
			new_fanfic.author = d['author']
			new_fanfic.text = d['text']
			new_fanfic.fandom = d['fandom']
			if d['text'] == '':
				new_fanfic.text = d['summary']
			print "text of fanfic: " + new_fanfic.text
			#alyssa's code -- getting keywords
			kwlist = my_immortal_keyword_finder.getwords(new_fanfic.text)
			try:
				new_fanfic.profile=str(image_return.googlePrep(d['fandom']))
			except:
				print "F**K the profile picture"
			new_fanfic.save()
			for kw in kwlist:
				kw = kw.strip()
				#banti's code -- getting image urls 
				try:
					new_fanfic.keyword_set.create(key_word=kw, image_url=str(image_return.googlePrep(kw)))
				except:
					print kw + "is f****d"
			return HttpResponseRedirect('/fanfics/'+ str(new_fanfic.id)) # Redirect after POST
	else:
		form = CreateURLForm() # An unbound form

	return render(request, 'fanfics/createURL.html', {
        'form': form,
        'latest_fanfic_list':latest_fanfic_list
    })
Code Example #49
File: testscraper.py Project: narenaryan/Sticker
def third_scrape():
    print "i am in third"
    third_url_box=[]
    for ele in second_scrape():
        for item in scrape(ele,'//a[contains(@href,"images")]/@href'):
            yield item
Code Example #50
File: main.py Project: amsully/mapmyrun_webscraper
import csv,scraper
with open('Alexs_workout_history.csv') as csvfile:
	reader = csv.reader(csvfile)
	for row in reader:
		scraper.scrape(row[14])
		
Code Example #51
File: main.py Project: undercase/scheduler
"""
Main Execution File for Scheduler
"""

from scraper import scrape
from ical import make_calendar

if __name__ == "__main__":
    make_calendar(scrape())
    print("Your calendar has been saved to this directory as 'UNT_schedule.ics'.")
Code Example #52
 def call(self):
     generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
     expected_eif = json.load(open(eif_file))
     self.assertEqual(byteify(expected_eif), byteify(generated_eif))
Code Example #53
def inject_methods():
    this_dir = os.path.abspath(os.path.dirname(__file__))
    source_xml_dir = join(this_dir, 'JATS')
    source_eif_dir = join(this_dir, 'EIF')
    source_partial_dir = join(this_dir, 'EIF', 'partial')

    # returns a map of {fname: /path/to/fname, ...} for given `dir`
    def path_map(parent):
        paths = map(lambda fname: join(parent, fname), os.listdir(parent))
        paths = filter(os.path.isfile, paths)
        return dict(zip(map(os.path.basename, paths), paths))

    # creates absolute paths to the EIF fixtures
    xml_path_list = path_map(source_xml_dir)
    eif_path_list = path_map(source_eif_dir)
    partial_eif_path_list = path_map(source_partial_dir)

    def xml_fname_to_eif(xml_fname, xml_path):
        return join(source_eif_dir, os.path.splitext(xml_fname)[0] + ".json")

    for xml_file, xml_path in xml_path_list.items():
        eif_file = xml_fname_to_eif(xml_file, xml_path)

        if not os.path.exists(eif_file):
            LOG.info('skipping %s, path `%s` not found', xml_file, eif_file)
            continue

        def _fn1(xml_path, eif_file):
            def call(self):
                generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
                expected_eif = json.load(open(eif_file))
                self.assertEqual(byteify(expected_eif), byteify(generated_eif))
            return call

        slug = xml_file.replace('-', '_').replace(' ', '').replace('/', '_')
        setattr(TestContent, 'test_eif_%s' % slug, _fn1(xml_path, eif_file))


    # handle partials

    def xml_fname_to_eif_partial(xml_fname, xml_path):
        return join(source_partial_dir, os.path.splitext(xml_fname)[0] + "-match.json")

    for xml_file, xml_path in xml_path_list.items():
        eif_path = xml_fname_to_eif_partial(xml_file, xml_path)

        if not os.path.exists(eif_path):
            LOG.info('skipping %s, path `%s` not found', xml_file, eif_path)
            continue

        generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
        # a list of maps with keys 'description' and 'data'
        eif_partial_tests = json.load(open(eif_path))

        for test in eif_partial_tests:
            if not test.has_key('description') or not test.has_key('data'):
                LOG.debug('description or data elements not found in file %r, skipping', eif_path)
                continue

            desc, expected_eif = test['description'], test['data']
            for element, expected_partial_eif in expected_eif.items():
                has_key = generated_eif.has_key(element)

                def _fn2(eif, expected_partial_eif):
                    def call(self):
                        self.assertTrue(has_all_keys(expected_partial_eif, ['description', 'data']))
                        self.assertEqual(byteify(expected_partial_eif), byteify(eif[element]))
                    return call

                slug = eif_path.replace('-', '_').replace(' ', '').replace('/', '_')
                setattr(TestContent, 'test_partial_%s' % slug, _fn2(generated_eif, expected_partial_eif))
Code Example #54
File: tests.py Project: benosment/bas
 def setUp(self):
     self.ribeye_url = "http://www.bonappetit.com/recipe/salt-and-pepper-rib-eye"
     self.cauliflower_url = "http://www.bonappetit.com/recipe/roasted-cauliflower-with-lemon-parsley-dressing"
     self.ribeye_scrape = scraper.scrape(self.ribeye_url)
     self.cauliflower_scrape = scraper.scrape(self.cauliflower_url)
Code Example #55
from pymongo import MongoClient
from scraper import scrape
import preprocess

if __name__ == '__main__':
	client = MongoClient()
	collection = client.twitter.tweets
	scrape('#Christmas',150000,collection)
	preprocess.count_date(collection.find())
	preprocess.analyze_term(collection.find())
	preprocess.to_geojson(collection.find({'coordinates': {'$exists':1}}))
	preprocess.construct_retweets_graph(collection.find())
	client.close()
	
Code Example #56
File: classifier.py Project: hjfu/LinkedIngine
def initLocationClassifier():
    """Initialize Location Classifier"""
    cities = open('data/indiancities', 'r').readlines()
    classes = [({'name':city.split()[0]},city.split()[1]) for city in cities]
    return nltk.NaiveBayesClassifier.train(classes)

location_classifier = initLocationClassifier()

def classify():
    """Classify ALL the profiles in the database
    [TODO]: Allow classification to run only on selected list of profiles"""
    for profile in dbinterface.collection.find():
        first_name = profile['first_name']
        locality   = profile['locality'].split()[0]

        gender, area = None, None

        # Classifiers
        if not profile.has_key('gender') or not profile.has_key('area'):
            gender = gender_classifier.classify(gender_features(first_name))
            area = location_classifier.classify({'name':locality})

        dbinterface.collection.update({'public_profile_url':profile['public_profile_url']},
                                         {'$set': {'gender':gender, 'area':area}})

if __name__ == '__main__':
    page = open('reference.profile.2', 'r')
    import scraper
    resume = scraper.scrape(page, 'http://www.example.com/')
    classify()
Code Example #57
File: main.py Project: tbridges42/XMLCleaner
def main():
    creds = config.get_creds()
    sftp.download(creds.get("sftp_url"), creds.get("sftp_username"), creds.get("sftp_password"), creds.get("localpath"))
    cleaner.clean(creds.get("localpath"))
    merge.merge(creds.get("localpath"))
    scraper.scrape(creds)
Code Example #58
File: testscraper.py Project: narenaryan/Sticker
def first_scrape():
    fir_url_box=scrape('http://ukiyo-e.org/','//a[contains(@href,"artist")]/@href')
    print "i am in first"
    print fir_url_box
    return fir_url_box