Code Example #1
File: main.py Project: tmoi29/AliceCLS
def get_text_links():
    for entry in links:
        arr = links[entry]
        if "//" not in arr[0]:
            i = 0
            while i + 2 < len(arr):
                try:
                    title = scraper.scrape(arr[i + 2],
                                           arr[1])[0].strip().encode(
                                               'ascii', 'ignore') + " " + entry
                except:
                    print arr
                    print i + 2

                data.loc[title] = ["", 0]
                data.loc[title]["Link"] = arr[i + 2]
                txts = scraper.scrape(arr[i + 2], arr[0])

                for txt in txts:
                    words = txt.split()
                    for w in words:
                        w = ''.join(c for c in w if c not in punctuation)
                        if w.lower() in keywords:
                            data.loc[title]["Score"] += 1
                i += 1
        else:
            print entry
Code Example #2
File: main.py Project: tmoi29/AliceCLS
def get_text_files():
    paths = glob.glob('files/*/*')
    for f in paths:
        source = f.split('/')[1]
        title = scraper.scrape_file(f, links[source][1])[0].strip().encode(
            'ascii', 'ignore') + ' ' + source
        data.loc[title] = ["", 0]
        for s in ["lexology", "natlawreview"]:
            if source == s:
                for link in links[s]:
                    if "//" in link:
                        t = scraper.scrape(link,
                                           links[s][1])[0].strip().encode(
                                               'ascii', 'ignore')
                        if t[:10].lower() in title.lower():
                            data.loc[title]['Link'] = link
                            break
        if source == "law360":
            for link in links["law360"]:
                if "//" in link:
                    t = scraper.scrape(link, "h1")[0].strip().encode(
                        'ascii', 'ignore')
                    if t[:10].lower() in title.lower():
                        data.loc[title]['Link'] = link
                        break

        txts = scraper.scrape_file(f, links[source][0])
        for txt in txts:
            words = txt.split()
            for w in words:
                w = ''.join(c for c in w if c not in punctuation)
                if w.lower() in keywords:
                    data.loc[title]["Score"] += 1
Code Example #3
def scrape_nnm():
	settings = player.load_settings()
	data_path = settings.torrents_path()

	hashes = []
	for torr in filesystem.listdir(filesystem.join(data_path, 'nnmclub')):
		if torr.endswith('.torrent'):
			try:
				from base import TorrentPlayer
				tp = TorrentPlayer()
				tp.AddTorrent(filesystem.join(data_path, 'nnmclub', torr))
				data = tp.GetLastTorrentData()
				if data:
					hashes.append((data['announce'], data['info_hash'], torr.replace('.torrent', '.stat')))
			except BaseException as e:
				log.print_tb(e)

	for chunk in chunks(hashes, 32):
		import scraper
		try:
			seeds_peers = scraper.scrape(chunk[0][0], [i[1] for i in chunk])
		except RuntimeError as RunE:
			if '414 status code returned' in RunE.message:
				for c in chunks(chunk, 16):
					try:
						seeds_peers = scraper.scrape(c[0][0], [i[1] for i in c])
						process_chunk(c, data_path, seeds_peers)
					except BaseException as e:
						log.print_tb(e)
			continue
		except BaseException as e:
			log.print_tb(e)
			continue

		process_chunk(chunk, data_path, seeds_peers)
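The `chunks` helper called above is not part of the snippet. A minimal sketch of what it is assumed to do (yield fixed-size slices of a sequence) could look like this:

# Assumed helper, not taken from the project above: yields successive
# slices of at most n items from seq.
def chunks(seq, n):
    for i in range(0, len(seq), n):
        yield seq[i:i + n]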
Code Example #4
def main():
    parser = argparse.ArgumentParser(description='Bulk Saving Of Job Posts')
    parser.add_argument('-q', help='Search Term Query')
    parser.add_argument('-l', help='Location')
    parser.add_argument('-o', help='Output location')
    parser.add_argument('--json', help='Export Json File')
    parser.add_argument('--xlsx', help='Export Excel File')

    args = parser.parse_args()

    # print(args.accumulate(args.integers))
    if args.l == None:
        print "Must include location with: -l 'location'"
        exit()
    if args.q == None:
        print "Must include search term with: -q 'job title'"
        exit()

    if (args.q != None and args.l != None):
        print 'make this stuff happen'
        output = ''
        if (args.o == None):
            fileName = args.l.replace(' ', '-').replace(
                ',', '') + '-' + args.q.replace(' ', '-')
            output = os.getcwd() + '/' + fileName + '.xlsx'
        else:
            output = args.o
        scraper.scrape({
            'location': args.l,
            'search': args.q,
            'output': output
        })
Code Example #5
def concertScrape(limitOfNew):
    key = 'Dv4TzTBMtO5GJ57Dcrf0Jbxst8fEQHLx'
    secret = 'lgAN5MVcjmUM30rB'
    url = 'https://app.ticketmaster.com/discovery/v2/events.json?size=' + str(limitOfNew) + '&classificationName=concert&apikey=Dv4TzTBMtO5GJ57Dcrf0Jbxst8fEQHLx'
    
    contents = urllib2.urlopen(url).read()
    data = json.loads(contents)
    
    for i in range(len(data["_embedded"]["events"])):
        try:
            event = data["_embedded"]["events"][i]
            
            # Get all necessary info from json
            creator = event["promoter"]["name"]
            date = event["dates"]["start"]["localDate"]
            name = event["name"]
            url = event["images"][0]["url"]
            desc = event["url"]
            
            # Change from unicode str
            creator = scraper.uni_to_str(creator)
            date = scraper.uni_to_str(date)
            name = scraper.uni_to_str(name)
            url= scraper.uni_to_str(url)
            desc = scraper.uni_to_str(desc)
            
            print(creator, date, name, url, desc)
                    
            scraper.scrape(creator, name, desc, date, "concert", url)
        except:
            continue
Code Example #6
def main():
    creds = config.get_creds()
    sftp.download(creds.get("sftp_url"), creds.get("sftp_username"),
                  creds.get("sftp_password"), creds.get("localpath"))
    cleaner.clean(creds.get("localpath"))
    merge.merge(creds.get("localpath"))
    scraper.scrape(creds)
Code Example #7
def main():
  if len(sys.argv) <= 1:
    printUsageAndExit()

  animes = sys.argv[1:]
  anime_urls = [utils.getPageUrl(anime) for anime in animes]
  s.scrape(anime_urls)
Code Example #8
def predict(pages, team1, team2, m):
    #make sure we have data on these teams
    if not(isTop20(team1)):
        print("Team 1 is not a top 20 team")
        return
    if not(isTop20(team2)):
        print("Team 2 is not a top 20 team")
        return 
    if not(isMap(m)):
        print("The map is not in our pool")
        return
    if pages>0:
        print("Scraping process will take some time, please be patient")
        print("**********Scraping Map Results now**********")
        scrape(pages)
        print("**********Scraping Map Stats now**********")
        scrape_map_stats()
        print("**********Filtering Data now**********")
        filterCSV()
        print("**********Generating Data now**********")
    #Make sure we have a "filtered_top20.csv" file to examine (in case the user doesn't scrape)
    if os.path.isfile('filtered_top20.csv'):
        data = getDataReady()
        tree = build_tree_id3(data)
        #pp = pprint.PrettyPrinter(indent=4)
        #pp.pprint(tree)
        boolean = {True : team1, False : team2}
        print("{} would win.".format(boolean[classify(tree,userInputStats(team1, team2, m))]))  
    else:
        print('\"filtered_top20.csv\" was not found. Please scrape for data before attempting to predict')
Code Example #9
def test_scrape_raises_http_error():
    with mock.patch('scraper.urlopen') as urlopen_mock:
        urlopen_mock.side_effect = HTTPError('http://example.org', 404,
                                             'Not found', {}, mock.Mock())
        with pytest.raises(HTTPError) as exc:
            scraper.scrape('http://example.org')
        assert exc.value.code == 404
        assert exc.value.msg == 'Not found'
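The `scraper` module exercised by this test is not shown on this page. A minimal sketch that would satisfy the test (module layout and return value are assumptions) might be:

# Hypothetical scraper.py: urlopen must be a module-level name so that
# mock.patch('scraper.urlopen') can replace it, and HTTPError is deliberately
# not caught so it propagates to the caller, as the test expects.
from urllib.request import urlopen

def scrape(url):
    with urlopen(url) as response:
        return response.read()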
Code Example #10
def lambda_handler(event, context):

    # Run scraping function
    scrape()

    # Query Spotify API, write into JSON file and upload it to S3
    query_spotify_api()

    print("The function ran successfully.")
Code Example #11
File: main.py Project: sudiptog81/lhd-covid-scraper
def home():
    try:
        scraper.scrape()
        return 'Ran Successfully'
    except Exception as e:
        if (app.config['DEBUG'] == True):
            return str(e)
        else:
            return 'Error Encountered'
Code Example #12
File: app.py Project: jdm79/headline-api
def print_headlines():
  # clear the list so we only get the latest headlines
  headlines.clear()

  # run this function over each paper
  for url in urls.values():
    scrape(url)
    
  # the function returns the final list
  return response
Code Example #13
File: profilefetcher.py Project: mr4jay/LinkedIngine
def fetchProfiles(initURL, maxcount):
    """Given the URL from where to initiate the crawling, it first fetches the webpage, sends it to
    the crawler for scraping data from the webpage. Not only that, it also reads all the public profile
    urls present in the current page and adds them to the list. In subsequent iterations, it will fetch
    the LinkedIn profiles of people associated with these urls. The iteration continues for the number of
    times specified by maxcount"""
    count = 0
    links = set([initURL])
    waitinglist = list()

    start = datetime.now()

    while count < maxcount:
        count += 1

        while len(links) > 0:
            newreq = links.pop()
            if newreq not in waitinglist:  # If the url hasn't been used already, add it to the waiting list
                waitinglist.append(newreq)
                break

        try:
            page = urllib2.urlopen(waitinglist[-1]).read(
            )  # Fetch the web page from the url just appended
            scraper.scrape(
                page,
                waitinglist[-1])  # Send the page and the url for scraping

            if len(links) < 3:
                links.update(profileURL.findall(
                    page))  # Get all the urls present in this web page
        except:
            pass

        links = set([link.strip('"')
                     for link in links])  # String processing to remove quotes

        percentage = int(count * 100.0 / maxcount)  # Progress bar
        sys.stdout.write('\r' + '=' * percentage + '>' + ' ' *
                         (101 - percentage) + str(percentage) + '%')
        sys.stdout.flush()

    print 'Fetched', count, 'profiles in', \
     (datetime.now() - start).total_seconds(), 'seconds'

    start = datetime.now()
    classifier.classify(
    )  # Classify all profiles in the database [TODO: classify only updated portion of db]
    print 'Classified all profiles in database in', \
     (datetime.now() - start).total_seconds(), 'seconds'

    indexer.computeIndexes(
    )  # Compute indexes for every profile in the database [TODO: same as above]
    print 'Calculated indexes for all profiles in database in', \
     (datetime.now() - start).total_seconds(), 'seconds'
Code Example #14
File: main.py Project: KC2004/bidwire
def main():
    log.info("Starting Bidwire run")
    start = time.time()

    scraper.scrape()
    log.info("Scraping complete. Sending notifications.")
    new_bids = notifier.send_new_bids_notifications(EMAIL_RECIPIENTS)
    elapsed_secs = time.time() - start

    log.info("Notification sending complete. Sending debug email.")
    DebugEmail().send(new_bids, EMAIL_RECIPIENTS, elapsed_secs)
Code Example #15
def main():
    log.info("Starting Bidwire run")
    start = time.time()

    scraper.scrape(SITE_CONFIG)
    log.info("Scraping complete. Sending notifications.")
    new_bids = notifier.send_new_notifications(SITE_CONFIG)
    elapsed_secs = time.time() - start

    log.info("Notification sending complete. Sending debug email.")
    DebugEmail().send(new_bids, SITE_CONFIG, elapsed_secs)
Code Example #16
File: companies.py Project: nickwu241/glass-heaven
def scrape_companies_data(
    company_names: List[str],
    use_cache: bool = False,
    n: int = 2147483647,
    skip_companies: Set[str] = set()
) -> Tuple[List[Company], List[FailedCompanyError]]:
    errors = []
    output_data = []

    for i, company_name in enumerate(company_names):
        if i >= n:
            break

        if company_name in skip_companies:
            print(f'[INFO] Skip scraping {company_name}')
            continue

        try:
            company_id = company_name.replace(' ', '_').lower()
            company = Company(id=company_id)
            overview_url, reviews_url = scraper.get_glassdoor_urls(
                company_name)
            print('[INFO]', company_name, overview_url, reviews_url)
            if overview_url is None or reviews_url is None:
                raise Exception(
                    f'Cannot find both URLs for "{company_name}": {overview_url} {reviews_url}'
                )

            reviews_data = scraper.scrape(reviews_url,
                                          f'{company_name}_reviews.html',
                                          scraper.get_reviews_data)
            overview_data = scraper.scrape(overview_url,
                                           f'{company_name}_overview.html',
                                           scraper.get_overview_data)
            data = {
                'name': company_name,
                'overview_url': overview_url,
                'reviews_url': reviews_url,
                'linkedin_url': scraper.get_linkedin_url(company_name),
            }
            data.update(reviews_data)
            data.update(overview_data)
            company.update_data(data)
            output_data.append(company)
        except Exception as e:
            print(f'[FAIL] caught exception when parsing "{company_name}"')
            errors.append(
                FailedCompanyError(
                    company_name=company_name,
                    exception=e,
                ))

    return output_data, errors
Code Example #17
 def get(self, college, year, branch, low, high, semc):
     scraper.scrape(college, year, branch, low, high, semc)
     filename = 'ExcelFiles/' + '1' + college + year + branch + low + '-' + high
     extension = '.xls'
     zipf = zipfile.ZipFile('Results-Excel.zip', 'w', zipfile.ZIP_DEFLATED)
     files = [
         filename + extension, filename + 'GPA' + extension,
         filename + 'RANK' + extension
     ]
     for file in files:
         zipf.write(file)
     zipf.close()
     return send_from_directory('', 'Results-Excel.zip')
Code Example #18
def start(parameters):

    try:
        db_connection = connector.connect(user='******',
                                          password='******',
                                          host='localhost',
                                          database='EAGLEEYE')

        scraper.scrape(parameters, db_connection)
    except:

        time.sleep(5)
        print("Database down, trying to connect...")
        start(parameters)
Code Example #19
 def test_scrape(self, source):
     test_limit = 3
     web_df = scrape(
         source=source,
         limit=test_limit,
         test=True,
         since=str(datetime.datetime.now().date() - datetime.timedelta(7)),
     )
     self.assertEqual(len(web_df), test_limit)
     web_df = scrape(source=source,
                     limit=test_limit,
                     test=True,
                     since="2019-09-17")
     self.assertEqual(len(web_df), test_limit)
Code Example #20
File: model.py Project: declankirk/breadtubebot2
def update_model():
    print("Scraping posts...")
    scrape(500000)

    print("Building model...")
    with open('titles.txt', encoding='utf8') as f:
        titles = f.read()
    model = markovify.NewlineText(titles)

    print("Exporting model...")
    model_json = model.to_json()
    with open('model.json', 'w') as f:
        f.write(model_json)
    
    print("Done!")
Code Example #21
def read_majors(
        game_id=int(db_game), year=int(db_year), base=None, current=False):
    set_readin_args(args)
    #slugs = ["genesis-5","summit6","shine2018","tbh8","summit7"]
    fails = []
    scrape_load = False
    slug_given = False
    if db_slug == None:
        if to_load_slugs:
            scrape_load = True
            if v >= 3 and year == int(db_year):
                print('Loading saved slugs...')
            slugs = load_slugs(game_id, year)
            if slugs == False or slugs == []:
                if v >= 3:
                    print('Saved slugs not found.')
                slugs = scraper.scrape(game_id, year, v)
                scrape_load = False
        else:
            slugs = scraper.scrape(game_id, year, v)
        fails = [event[1] for event in slugs if type(event) is tuple]
        slugs = [event for event in slugs if type(event) is str]
    elif type(db_slug) is list:
        slugs = db_slug
        slug_given = True
    else:
        #print(type(db_slug))
        slugs = [db_slug]
        slug_given = True
    if v >= 3 and not scrape_load and not slug_given:
        if len(slugs) <= 0:
            print('No slugs found for game %d in year %d:' % (game_id, year))
        else:
            print('Scraped the following slugs for game %d in year %d:' %
                  (game_id, year))
            print(slugs)
    if not fails == [] and v > 0:
        print(
            'The following majors could not be read (no smash.gg bracket found)'
        )
        print(fails)
    if to_save_db and not scrape_load and not slug_given:
        save_slugs(slugs, game_id, year, to_save_db=to_save_db)
    return (read_tourneys(slugs,
                          ver=game_id,
                          year=year,
                          base=base,
                          current=current))
Code Example #22
File: endpoints.py Project: Sciguystfm/Dining-Data
def handle_scrape():
    username = request.authorization.username
    password = request.authorization.password
    sdate = request.args.get("sdate")
    edate = request.args.get("edate")
    try:
        scrape(username, password, sdate, edate)
    except NoSuchElementException as e:
        return jsonify({
            "status": "failure"
        }),400


    return jsonify({
        "status": "success"
    }), 200
Code Example #23
File: renamer.py Project: wtcurtis/TVFilter
def renameAll(rootDir, metadataFile):
	try:
		season = os.path.basename(rootDir)
		metadata = scraper.scrape(open(metadataFile))
	#	print metadata
	#	sys.exit()
	
	except ValueError:
		print "Couldn't parse the season from the given."
		return 0
	
	files = os.listdir(rootDir)

	parsedFiles = []
	pattern = re.compile(r"([\w ]*) - (\d\d)x(\d\d)(.*)")

	for f in files:
		matches = pattern.search(f).groups()
		orgFile = os.path.join(rootDir, f)
		show = matches[0]
		season = int(matches[1])
		episode = int(matches[2])

		newName = '{0} - {1:0>2}x{2:0>2} - {3}{4}'.format(show, season, episode, metadata[season][episode], matches[3])
		
		result = raw_input('Moving {0} to {1}. Continue? (y/n)'.format(f, newName))
		if(result == 'y'):
			os.rename(orgFile, os.path.join(rootDir, newName))
		parsedFiles.append({'file': orgFile, 'newPath': os.path.join(rootDir, newName)})
	
	return parsedFiles
Code Example #24
File: default.py Project: tuffnerdstuff/xbmc-plugins
def buildVideoIndex(url):
    data = scraper.scrape(url)
    nextLinkUrl = scraper.scrapeNextPageLink(url)
    for name, info_url, img, date in data:
        addLink(name, info_url, 3, img)
    if (nextLinkUrl != None):
        addDir("[Next Page >>]", nextLinkUrl, 2, '')
Code Example #25
def search_scene():
    global search, running, show_books
    while search:
        # This is the search scene: an input box where you type in the name of the book
        screen.fill(GRAY)
        search_box.update()
        search_box.draw(screen)
        pygame.display.flip()
        for ev in pygame.event.get():
            if ev.type == pygame.QUIT:
                # Exiting the search scene here should shut the whole thing down, but it doesn't quite do that
                running = False
                search = False
            # From the class: checks whether you clicked on s_box, pressed enter or backspace, or typed text
            search_box.handle_event(ev)
            if search_box.enter:
                search_book = search_box.rText
                show_books = True
                search = False
                for book_dict in scraper.scrape(search_book):
                    # Puts the results from the scraper into the arrays
                    titles.append(book_dict["book"])
                    authors.append(book_dict["author"])
                    book_ids.append(book_dict["book_id"])
                    downloads.append(book_dict["downloads"])
    return images, titles
Code Example #26
def scraper():
    #delete_all_potential_locations()
    loopnetListings = scrape()
    j = 0
    name_set = {}
    for index, row in loopnetListings.iterrows():
        address_map = {}
        address = row[0]
        address_map['address'] = address
        components = address.split(",")
        try:
            address_map['street'] = components[0]
            address_map['city'] = components[1]
            address_map['state'] = components[2]
        except:
            print("Exception: invalid format of address")
            continue
        name = row[1]
        if name_set.get(name) == None:
            name_set[name] = 1
        else:
            name = name + " " + str(name_set.get(name))
        lat, lon = get_lat_long(address)
        try:
            store_scraped_in_google(address_map, name, lat, lon)
        except:
            print("Exception: Could not store in Google")
Code Example #27
 def test_english_detection(self):
     from translation import Translator
     my_translator = Translator(None)
     result = scraper.scrape("http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNFY1KzEAhaiZchzd5ulmoY4_4P8kA&url=http://vov.vn/Van-hoa/NSND-Thanh-Hoa-xuc-dong-hat-truoc-benh-nhan/228256.vov")
     self.assertFalse(result.get('unscrapable'))
     text_obj = process_resources.extract_clean_content(result['htmlContent'])
     self.assertFalse(my_translator.is_english(text_obj['content']))
Code Example #28
 def scrape_thread(cur2, pbar, count, qhashs, nth, total, ip="open.demonii.com"):
     db = MySQLdb.connect(**config.mysql)
     cur = db.cursor()
     last_commit = time.time()
     errno=0
     try:
         l = qhashs.get(timeout=0)
         while True:
             try:
                 for hash, info in scraper.scrape("udp://%s:1337/announce" % ip, l).items():
                     cur.execute("UPDATE torrents SET scrape_date=NOW(), seeders=%s, leechers=%s, downloads_count=%s WHERE hash=%s", (info['seeds'], info['peers'], info['complete'], hash))
                 if time.time() - last_commit > 30:
                     db.commit()
                     last_commit = time.time()
                 pbar.update(min(pbar.currval + len(l), count))
                 l = qhashs.get(timeout=0)
                 errno=0
             except (socket.timeout, socket.gaierror, socket.error):
                 db.commit()
                 time.sleep(0.1 * errno + 0.1)
                 errno+=1
                 if errno > 10:
                     raise
     except queue.Empty:
         pass
     finally:
         db.commit()
         cur.close()
         db.close()
Code Example #29
File: app.py Project: wi11/Intern.find
def display_tables():
    max_results_per_state = 10
    state_set = ["California", "New York", "Washington", "Illinois", "Texas"]
    data = {"Job Title":[], "Company":[], "Location":[]}
    dataframe = scraper.scrape(max_results_per_state, state_set, data)
    print "HI"
    return render_template("table.html", dataframe=dataframe.to_html())
Code Example #30
def get_torrent_stats(url):
    response = requests.get(url)
    data = bencodepy.decode(response.content)
    files = data[b'info'][b'files']
    size = 0
    for file in files:
        size += file[b'length']
    size = size * 1e-9
    size = round(size, 2)
    info_hash = hashlib.sha1(bencodepy.bencode(data[b"info"])).hexdigest()

    trackers_list = data[b'announce-list']

    stats = {'seeds': 0, 'peers': 0}

    for tracker_url in trackers_list:
        tracker_url = tracker_url[0].decode('utf-8')
        result = scrape(tracker_url, [info_hash])
        if not result:
            continue
        if result[info_hash]['seeds'] is None or result[info_hash][
                'peers'] is None:
            continue
        stats['seeds'] = max(stats['seeds'], result[info_hash]['seeds'])
        stats['peers'] = max(stats['peers'], result[info_hash]['peers'])
    stats['size_gb'] = size
    return stats
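A possible call of the function above; the .torrent URL is a placeholder, not taken from any of the projects on this page:

# Hypothetical usage of get_torrent_stats(); URL and output are illustrative only.
stats = get_torrent_stats('https://example.org/files/sample.torrent')
print(stats)  # e.g. {'seeds': 12, 'peers': 3, 'size_gb': 1.4}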
Code Example #31
File: app.py Project: aphrx/jabber
def scheduled():

    # print out time that cron job was deployed in console
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("Cron Job Executed at " + dt_string)

    # Find users who signed up for cron job
    users = mongo.db.users.find({'cron': {"$exists": True}})

    # For each user who has signed up for the cron job, find jobs they are interested in
    for u in users:
        test = scraper.scrape()
        jobs = []
        employer = []
        links = []
        jobs, employer, links, count = test.search(u['cron']['cron_job'],
                                                   u['cron']['cron_loc'], True)
        j = jobbankapply.apply(links)
        emails, jobs, employer = j.run()

        # find user's cv and resume
        cv_data = u['cv']
        cv_data = cv_data.encode('latin-1', 'replace').decode('latin-1')
        resume = u['resume']
        resume = resume.encode('latin-1', 'replace').decode('latin-1')

        # apply for jobs on behalf of user
        j.email(emails, jobs, employer, cv_data, resume, u['id'], u['email'])
Code Example #32
File: web2epub.py Project: gauravmm/Web2ePub
def run(args):
	styler = getStyle(args.style[0]);
	if not styler:
		raise RuntimeError("Cannot find output style " + args.style + ".");
	print "Loaded Style: \t" + styler.name + ".";

	print "Scraping Website";
	web = scrape(args.url, styler, cache=(not args.no_cache));

	if len(web[1]) == 0:
		raise RuntimeError("No valid pages found!");

	web = (styler.edit_book_metadata(web[0]), web[1], web[2]);
	
	fout = title_fn(web[0]["author"] + " - " + web[0]["title"]);
	if args.out and len(args.out) > 0 and len(args.out[0]) > 0:
		# If args.out is a directory, then we use our custom filename, otherwise
		# we use the given filename.
		if args.out[0][-1] == "/":
			fout = args.out[0] + fout;
		else:
			fout = args.out[0];

		if not fout.lower().endswith(".epub"):
			fout = fout + ".epub";
	
	if args.no_overwrite and path.exists(fout):
		print "Skipped! \"" + fout + "\" exists.";
	else:
		print "Building ePub";
		epub(web, fout, styler, args);
		
		print "Done! Written output to \"" + fout + "\"";
Code Example #33
File: ml.py Project: eoershova/vk-sentiment
def async_handler():
    conn = sqlite3.connect('test2.db', check_same_thread=False)
    c = conn.cursor()
    while True:
        c.execute(
            "SELECT token, period, request_id FROM test WHERE score = 'unready'"
        )
        request = c.fetchone()

        if request is not None:
            print('ПРИНИМАЮСЬ')
            request_id = request[2]
            print(type(request_id))
            date = request[1]
            token = request[0]
            dates = period(date.split(" ")[0], date.split(" ")[1])
            posts = scrape(dates, token)
            comments = get_comments(token, posts, dates, request_id)
            result = predict(comments)
            predictions_df = result[1]
            c.execute("UPDATE test SET score = ? where request_id = ?",
                      (result[0], request_id))
            c.execute("UPDATE test SET comments = ? where request_id = ?",
                      (predictions_df, request_id))
            conn.commit()
            print('УПРАВИЛСЯ')
            continue
        time.sleep(0.5)
Code Example #34
def mooving(ticker):

    """
This function retrieves the dataframe created by the scrape function.
It runs mooving_average on it to predict tomorrows prices.

    """
    df = scrape(ticker)
    df_cut = pd.DataFrame(
        df, columns=['date', 'closing', 'SMA', 'EMA_Short', 'EMA_Long'])
    df_cut.closing = np.around(df_cut.closing, decimals=2)

    window = round(len(df_cut)*0.2)

    #Simple Moving Average

    preds=[]
    for i in range(window):
        x = df_cut.closing[(len(df_cut) - 2*window + i):(len(df_cut) - window + 1)].sum() + sum(preds)
        x_mean = x/window
        preds.append(np.around(x_mean, decimals=2))
        df_cut['SMA'][len(df_cut) - window + i] = preds[i]

    rms_sma=np.sqrt(mean_squared_error(np.array(df_cut['closing'][(len(df_cut) - window) :]), np.array(df_cut['SMA'][(len(df_cut) - window) :])))

    #Tomorrow's predicted price
    num=0
    denom=0
    for j in range(window):
        num+= j*(df_cut.closing[len(df_cut) - window + j])
        denom+= j
    pred_weighted = num/denom

    print(df_cut)
    return pred_weighted
Code Example #35
def all():
    a = scrape()
    return jsonify({
        "head": "from covid-19-generator",
        "status": 200,
        "body": a
    })
Code Example #36
File: profilefetcher.py Project: hjfu/LinkedIngine
def fetchProfiles(initURL, maxcount):
    """Given the URL from where to initiate the crawling, it first fetches the webpage, sends it to
    the crawler for scraping data from the webpage. Not only that, it also reads all the public profile
    urls present in the current page and adds them to the list. In subsequent iterations, it will fetch
    the LinkedIn profiles of people associated with these urls. The iteration continues for the number of
    times specified by maxcount"""
    count = 0
    links = set([initURL])
    waitinglist = list()

    start = datetime.now()

    while count< maxcount:
        count += 1

        while len(links) > 0:
            newreq = links.pop()
            if newreq not in waitinglist:   # If the url hasn't been used already, add it to the waiting list
                waitinglist.append(newreq)
                break

        try:
            page = urllib2.urlopen(waitinglist[-1]).read() # Fetch the web page from the url just appended
            scraper.scrape(page, waitinglist[-1]) # Send the page and the url for scraping

            if len(links) < 3:
                links.update(profileURL.findall(page)) # Get all the urls present in this web page
        except:
            pass

        links = set([link.strip('"') for link in links]) # String processing to remove quotes

        percentage = int(count*100.0/maxcount)    # Progress bar
        sys.stdout.write('\r'+'='*percentage+'>'+' '*(101-percentage) +str(percentage)+'%')
        sys.stdout.flush()

    print 'Fetched', count, 'profiles in', \
     (datetime.now() - start).total_seconds(), 'seconds'

    start = datetime.now()
    classifier.classify() # Classify all profiles in the database [TODO: classify only updated portion of db]
    print 'Classified all profiles in database in', \
     (datetime.now() - start).total_seconds(), 'seconds'

    indexer.computeIndexes() # Compute indexes for every profile in the database [TODO: same as above]
    print 'Calculated indexes for all profiles in database in', \
     (datetime.now() - start).total_seconds(), 'seconds'
Code Example #37
File: nextride.py Project: DrashtiBhatt/NextRide
def mobile():
    stop = request.args.get("stop", 1, type=int)
    schedule = scrape(stop)
    if schedule:
        response = dict(meta=dict(status=200, message="OK"), data=schedule)
    else:
        abort(400)
    return render_template("m.html", path=response)
Code Example #38
File: nextride.py Project: DrashtiBhatt/NextRide
def api():
    stop = request.args.get("stop", 1, type=int)
    schedule = scrape(stop)
    if schedule:
        response = jsonify(meta=dict(status=200, message="OK"), data=schedule)
    else:
        abort(400)
    return response
Code Example #39
File: testscraper.py Project: narenaryan/Sticker
def second_scrape():
    fir_url_box=list(first_scrape())
    print "i am in second"
    sec_url_box=[]
    for ele in fir_url_box:
        print ele
        for item in scrape(ele,'//a[@class="img"]/@href'):
            yield item
Code Example #40
File: api.py Project: kshvmdn/cdn-university-api
def api(code):
    try:
        data = scrape(code)
    except:
        data = None

    if not data:
        abort(404, {'message': 'Couldn\'t retrieve data for program %s.' % code})

    return jsonify(meta=dict(status=200, message='OK'), data=data)
Code Example #41
File: bordertimes.py Project: kshvmdn/border-times
def single(port):
    try:
        wait_times = scrape(port)
    except KeyError:
        abort(404, {'message': 'Invalid `port` value.'})

    if not wait_times:
        abort(400)

    return jsonify(meta=dict(status=200, message='OK'), data=wait_times)
Code Example #42
File: feeds.py Project: code56/jats-scraper
def scrape(docs_dir, process=None, article_version=None):
    if docs_dir is not None:
        import scraper
        mod = __import__(__name__)
        res = scraper.scrape(mod, doc=docs_dir, article_version=article_version)
        if process:
            res = process(res)

        import json
        res = json.dumps(res, indent=4, ensure_ascii = False)
        return res.encode('utf8')
Code Example #43
File: main.py Project: Manuel87/textual-tools
def web_count(name, levels, out=None):
    folder = scrape(name, levels, out)
    file_list = folder_reader(folder)
    count = Counter()
    count.name = folder
    count.source = name
    if file_list == None:
        return
    else:
        for file in file_list:
            count.count(folder + "/" + file)
    if out == None:
        out = os.getcwd()
    writer(out + "/report.JSON", json_formulate(count))
Code Example #44
File: db.py Project: barrylavides/New-Movies
def insert_movie():
    url = 'http://www.imdb.com/movies-coming-soon'
    url_exist = mongo.db.movies.find_one({'url': url})
    action = ''

    if url_exist == None:
        movies = mongo.db.movies.insert_many(scraper.scrape(url))
        action = 'Scrape movies and add to database'
    else:
        pass
        # Get movies from database
        action = 'Get movies from database'

    return action
Code Example #45
File: app.py Project: Nyubis/kul-calendar
def lookup(coursestring):
    courses = coursestring.split("+")
    entries_by_day = {}
    for course in courses:
        # check whether the user provided a valid day, otherwise use today
        day_param = request.args.get('day')
        if day_param is not None and verify_date_param(day_param):
            coursedata = scraper.scrape(course, day_param)
        else:
            coursedata = scraper.scrape(course)
        # this data is from one particular course
        # we take the individual moments and put them in the dict, separated by day
        bucketadd(entries_by_day, "day", coursedata)

    # this will end up containing Weekday objects, which also contain the courses for that day
    weekdays_with_courses = [] 
    for date, entries in entries_by_day.items():
        weekdays_with_courses.append(Weekday(date, entries))

    # sort the Weekdays based on their weekindex, so monday comes first and sunday last
    sorted_data = sorted(weekdays_with_courses, key=lambda x: x.weekindex)
    print(sorted_data)
    return render_template("lookup.html", days=sorted_data)
Code Example #46
def scrape_now(fn):
	debug(fn)
	tp = TorrentPlayer()
	tp.AddTorrent(fn)
	data = tp.GetLastTorrentData()
	debug(str(data))
	if data:
		hashes = [data['info_hash']]
		import scraper

		res = scraper.scrape(data['announce'], hashes)
		debug(str(res))
		return res[data['info_hash']]
	else:
		return {}
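Several snippets on this page index the result of scraper.scrape(announce, hashes) as a dict keyed by info hash. Based on how examples #28, #30 and #47 read it, the assumed shape is roughly the following (hash and numbers are illustrative only):

# Assumed shape of a tracker scrape result, inferred from how the snippets index it.
example_result = {
    'aabbccddeeff00112233445566778899aabbccdd': {
        'seeds': 10,      # seeders
        'peers': 2,       # leechers
        'complete': 57,   # completed downloads
    },
}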
Code Example #47
File: feed.py Project: henrytrager/openbay-crawler
 def scrape_thread(cur2, pbar, count, qhashs, nth, total, ips=["open.demonii.com"]):
     db = MySQLdb.connect(**config.mysql)
     cur = db.cursor()
     last_commit = time.time()
     errno=0
     nip = len(ips)
     banip=collections.defaultdict(int)
     i = nth
     try:
         l = qhashs.get(timeout=0)
         while True:
             try:
                 ip = ips[i%len(ips)]
                 i+=1
                 for hash, info in scraper.scrape("udp://%s:1337/announce" % ip, l).items():
                     cur.execute("UPDATE torrents SET scrape_date=NOW(), seeders=%s, leechers=%s, downloads_count=%s WHERE hash=%s", (info['seeds'], info['peers'], info['complete'], hash))
                 if time.time() - last_commit > 30:
                     db.commit()
                     last_commit = time.time()
                 pbar.update(min(pbar.currval + len(l), count))
                 l = qhashs.get(timeout=0)
                 errno=0
             except (socket.timeout, socket.gaierror, socket.error):
                 db.commit()
                 banip[ip]+=1
                 if banip[ip]>3:
                     try:ips.remove(ip)
                     except ValueError:
                         pass
                 if not ips:
                     raise ValueError("all ips failed")
                 time.sleep(0.1 * errno + 0.1)
                 errno+=1
                 if errno > nip*3:
                     raise
     except (queue.Empty, ZeroDivisionError):
         pass
     except (socket.timeout, socket.gaierror, socket.error):
         qhashs.put(l)
     except (ValueError, RuntimeError) as e:
         print e
     finally:
         db.commit()
         cur.close()
         db.close()
Code Example #48
File: views.py Project: bgheneti/PythonPictoscrape
def createURL(request):
	latest_fanfic_list = FanFic.objects.all().order_by('-pub_date')[:11]

	if request.method == 'POST': 
		form = CreateURLForm(request.POST)
		if form.is_valid():
			new_fanfic = form.save(commit=False)
			#banti's code -- scraping from url
			d = scraper.scrape(form.cleaned_data['url'])
			new_fanfic.title = d['title']
			new_fanfic.author = d['author']
			new_fanfic.text = d['text']
			new_fanfic.fandom = d['fandom']
			if d['text'] == '':
				new_fanfic.text = d['summary']
			print "text of fanfic: " + new_fanfic.text
			#alyssa's code -- getting keywords
			kwlist = my_immortal_keyword_finder.getwords(new_fanfic.text)
			try:
				new_fanfic.profile=str(image_return.googlePrep(d['fandom']))
			except:
				print "F**K the profile picture"
			new_fanfic.save()
			for kw in kwlist:
				kw = kw.strip()
				#banti's code -- getting image urls 
				try:
					new_fanfic.keyword_set.create(key_word=kw, image_url=str(image_return.googlePrep(kw)))
				except:
					print kw + "is f****d"
			return HttpResponseRedirect('/fanfics/'+ str(new_fanfic.id)) # Redirect after POST
	else:
		form = CreateURLForm() # An unbound form

	return render(request, 'fanfics/createURL.html', {
        'form': form,
        'latest_fanfic_list':latest_fanfic_list
    })
Code Example #49
File: testscraper.py Project: narenaryan/Sticker
def third_scrape():
    print "i am in third"
    third_url_box=[]
    for ele in second_scrape():
        for item in scrape(ele,'//a[contains(@href,"images")]/@href'):
            yield item
Code Example #50
File: main.py Project: amsully/mapmyrun_webscraper
import csv,scraper
with open('Alexs_workout_history.csv') as csvfile:
	reader = csv.reader(csvfile)
	for row in reader:
		scraper.scrape(row[14])
		
Code Example #51
File: main.py Project: undercase/scheduler
"""
Main Execution File for Scheduler
"""

from scraper import scrape
from ical import make_calendar

if __name__ == "__main__":
    make_calendar(scrape())
    print("Your calendar has been saved to this directory as 'UNT_schedule.ics'.")
Code Example #52
 def call(self):
     generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
     expected_eif = json.load(open(eif_file))
     self.assertEqual(byteify(expected_eif), byteify(generated_eif))
Code Example #53
def inject_methods():
    this_dir = os.path.abspath(os.path.dirname(__file__))
    source_xml_dir = join(this_dir, 'JATS')
    source_eif_dir = join(this_dir, 'EIF')
    source_partial_dir = join(this_dir, 'EIF', 'partial')

    # returns a map of {fname: /path/to/fname, ...} for given `dir`
    def path_map(parent):
        paths = map(lambda fname: join(parent, fname), os.listdir(parent))
        paths = filter(os.path.isfile, paths)
        return dict(zip(map(os.path.basename, paths), paths))

    # creates absolute paths to the EIF fixtures
    xml_path_list = path_map(source_xml_dir)
    eif_path_list = path_map(source_eif_dir)
    partial_eif_path_list = path_map(source_partial_dir)

    def xml_fname_to_eif(xml_fname, xml_path):
        return join(source_eif_dir, os.path.splitext(xml_fname)[0] + ".json")

    for xml_file, xml_path in xml_path_list.items():
        eif_file = xml_fname_to_eif(xml_file, xml_path)

        if not os.path.exists(eif_file):
            LOG.info('skipping %s, path `%s` not found', xml_file, eif_file)
            continue

        def _fn1(xml_path, eif_file):
            def call(self):
                generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
                expected_eif = json.load(open(eif_file))
                self.assertEqual(byteify(expected_eif), byteify(generated_eif))
            return call

        slug = xml_file.replace('-', '_').replace(' ', '').replace('/', '_')
        setattr(TestContent, 'test_eif_%s' % slug, _fn1(xml_path, eif_file))


    # handle partials

    def xml_fname_to_eif_partial(xml_fname, xml_path):
        return join(source_partial_dir, os.path.splitext(xml_fname)[0] + "-match.json")

    for xml_file, xml_path in xml_path_list.items():
        eif_path = xml_fname_to_eif_partial(xml_file, xml_path)

        if not os.path.exists(eif_path):
            LOG.info('skipping %s, path `%s` not found', xml_file, eif_path)
            continue

        generated_eif = scraper.scrape(feeds, doc=xml_path)[0]['article'][0]
        # a list of maps with keys 'description' and 'data'
        eif_partial_tests = json.load(open(eif_path))

        for test in eif_partial_tests:
            if not test.has_key('description') or not test.has_key('data'):
                LOG.debug('description or data elements not found in file %r, skipping', eif_path)
                continue

            desc, expected_eif = test['description'], test['data']
            for element, expected_partial_eif in expected_eif.items():
                has_key = generated_eif.has_key(element)

                def _fn2(eif, expected_partial_eif):
                    def call(self):
                        self.assertTrue(has_all_keys(expected_partial_eif, ['description', 'data']))
                        self.assertEqual(byteify(expected_partial_eif), byteify(eif[element]))
                    return call

                slug = eif_path.replace('-', '_').replace(' ', '').replace('/', '_')
                setattr(TestContent, 'test_partial_%s' % slug, _fn2(generated_eif, expected_partial_eif))
Code Example #54
File: tests.py Project: benosment/bas
 def setUp(self):
     self.ribeye_url = "http://www.bonappetit.com/recipe/salt-and-pepper-rib-eye"
     self.cauliflower_url = "http://www.bonappetit.com/recipe/roasted-cauliflower-with-lemon-parsley-dressing"
     self.ribeye_scrape = scraper.scrape(self.ribeye_url)
     self.cauliflower_scrape = scraper.scrape(self.cauliflower_url)
Code Example #55
from pymongo import MongoClient
from scraper import scrape
import preprocess

if __name__ == '__main__':
	client = MongoClient()
	collection = client.twitter.tweets
	scrape('#Christmas',150000,collection)
	preprocess.count_date(collection.find())
	preprocess.analyze_term(collection.find())
	preprocess.to_geojson(collection.find({'coordinates': {'$exists':1}}))
	preprocess.construct_retweets_graph(collection.find())
	client.close()
	
Code Example #56
File: classifier.py Project: hjfu/LinkedIngine
def initLocationClassifier():
    """Initialize Location Classifier"""
    cities = open('data/indiancities', 'r').readlines()
    classes = [({'name':city.split()[0]},city.split()[1]) for city in cities]
    return nltk.NaiveBayesClassifier.train(classes)

location_classifier = initLocationClassifier()

def classify():
    """Classify ALL the profiles in the database
    [TODO]: Allow classification to run only on selected list of profiles"""
    for profile in dbinterface.collection.find():
        first_name = profile['first_name']
        locality   = profile['locality'].split()[0]

        gender, area = None, None

        # Classifiers
        if not profile.has_key('gender') or not profile.has_key('area'):
            gender = gender_classifier.classify(gender_features(first_name))
            area = location_classifier.classify({'name':locality})

        dbinterface.collection.update({'public_profile_url':profile['public_profile_url']},
                                         {'$set': {'gender':gender, 'area':area}})

if __name__ == '__main__':
    page = open('reference.profile.2', 'r')
    import scraper
    resume = scraper.scrape(page, 'http://www.example.com/')
    classify()
Code Example #57
File: main.py Project: tbridges42/XMLCleaner
def main():
    creds = config.get_creds()
    sftp.download(creds.get("sftp_url"), creds.get("sftp_username"), creds.get("sftp_password"), creds.get("localpath"))
    cleaner.clean(creds.get("localpath"))
    merge.merge(creds.get("localpath"))
    scraper.scrape(creds)
Code Example #58
File: testscraper.py Project: narenaryan/Sticker
def first_scrape():
    fir_url_box=scrape('http://ukiyo-e.org/','//a[contains(@href,"artist")]/@href')
    print "i am in first"
    print fir_url_box
    return fir_url_box