def main():
    scrape.main()
    step = 25                  # how many pixels we'll jump over; the higher it is, the more HD the sub-image will appear
    targetWidth = 2500         # how big the output image will be
    folder = 'Scraped Images'  # folder to get images from
    imageDict = load_images(folder, dimension=(step, step))  # load images to paste on
    imageFile = 'gia.jpg'      # image we'll be making a photo mosaic out of
    editedImage = photo_mosaic(imageFile, imageDict=imageDict, step=step,
                               targetWidth=targetWidth)  # get a photo mosaic
    editedImage.show()         # view image
    save_image(editedImage, imageFile)  # save image
def main(teams, delay, lock):
    start = time.time()
    scrape.main(teams, lock)
    # Go on forever: issue a new request after each delay.
    while True:
        # Wait for some time.
        if (time.time() - start) > delay:
            print("CALL SCRAPE.MAIN() AGAIN.")
            scrape.main(teams, lock)
            start = time.time()
            print(time.ctime(start), " time after delay call to scrape.")

# if __name__ == "__main__":
#     teams = ["mexico", "uruguay"]
#     main(teams, 10, Tlock)
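# A minimal alternative sketch of the same periodic re-scrape, assuming
# scrape.main(teams, lock) has the signature used above. It swaps the
# busy-wait loop for time.sleep so the process idles between calls; the
# behaviour (call scrape.main roughly every `delay` seconds) is unchanged.
import time

import scrape  # same module the snippet above relies on


def run_periodically(teams, delay, lock):
    while True:
        scrape.main(teams, lock)
        print(time.ctime(), "- sleeping", delay, "seconds before the next scrape.")
        time.sleep(delay)  # block instead of spinning on time.time()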
def application():
    print(request)
    companies = request.args.get('companies')
    print(companies)
    companylist = companies.split(',')
    blogs = main(companies)
    # blogs.tags = json.dumps(blogs.tags)
    return render_template('application.html', companies=companylist, blogs=blogs)
def test_quote_extractions():
    # TODO: Don't curl the website, instead pull from the checked in HTML file!
    for i, quote in enumerate(scrape.main(year=2015, month=1, page=20)):
        fname = join(dirname(__file__), 'data/{}.json'.format(i))
        with open(fname, 'r') as f:
            expected = f.read()
        observed = quote.json()
        assert expected == observed
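# A rough sketch of the TODO above: feed the parser a checked-in HTML fixture
# instead of hitting the network. `scrape.fetch` is a hypothetical name for
# whatever function inside scrape performs the HTTP request, and
# 'data/page_2015_01_20.html' is a hypothetical fixture path; both would have
# to match the real module before this could run.
from os.path import dirname, join
from unittest import mock

import scrape


def test_quote_extractions_offline():
    fixture = join(dirname(__file__), 'data/page_2015_01_20.html')
    with open(fixture, 'r') as f:
        html = f.read()
    with mock.patch('scrape.fetch', return_value=html):
        quotes = list(scrape.main(year=2015, month=1, page=20))
    assert quotes  # parsing the fixture should yield at least one quote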
def get_recent_changes(links, done_links):
    """Scrape links on the Special:RecentChanges page."""
    _, _, _, recent_links = S.main('Special:RecentChanges')
    starting_recent_links_num = len(recent_links)
    # recent_links = recent_links.difference(done_links)  # Bad! What if updated?
    recent_links = recent_links.difference(links)
    links.update(recent_links)
    print('Retrieved {} links from "Special:RecentChanges"; {} of which are new.'
          .format(starting_recent_links_num, len(recent_links)))
    return links
def test_dedup_quotes_across_adjacent_pages():
    cross_page_quotes = []
    cross_page_quotes += list(scrape.main(year=2018, month=1, page=1))
    cross_page_quotes += list(scrape.main(year=2018, month=1, page=2))
    quotes_counter = Counter()
    for quote in cross_page_quotes:
        quotes_counter[quote] += 1
        # print('{} -- hashes to -- {}'.format(
        #     repr(quote)[:10],
        #     hash(quote)
        # ))
    fname = join(dirname(__file__),
                 'data/dedup_quotes_across_adjacent_pages.pprint')
    with open(fname, 'r') as f:
        expected = f.read()
    observed = pformat(quotes_counter, indent=4)
    assert expected == observed
def main():
    # Update.
    scrape.main()
    # Add any new files.
    cmd = 'git add .'
    check_call(cmd, shell=True)
    # Commit changes.
    cmd = 'git commit -am"changes as of %s"'
    cmd = cmd % datetime.datetime.now().isoformat()
    check_call(cmd, shell=True)
    # Pull changes.
    cmd = 'git pull origin master'
    check_call(cmd, shell=True)
    # Push changes.
    cmd = 'git push origin master'
    check_call(cmd, shell=True)
def match_data(postalcode, year, id, sid):
    '''Create two CSV files: one with weather data and one with solar panel data.'''
    solar_file = str(postalcode) + "_" + str(year) + "_S.csv"
    weather_file = str(postalcode) + "_" + str(year) + "_W.csv"
    solarpanel = main(1, year, 1, year + 1, id, sid)
    weather = _main_(postalcode, year)
    weather = add_month(weather, year)
    solarpanel = solar_csv(solarpanel)
    weather = weather_csv(solarpanel, weather)
    weather.to_csv(weather_file)
    solarpanel.to_csv(solar_file)
def main():
    workouts = scrape.main()
    email = construct_email(workouts)
    send_email(email)
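# construct_email() and send_email() are not shown in the snippet above; this
# is only a sketch of what they might look like, assuming `workouts` is an
# iterable of strings and that a local SMTP relay and the addresses below
# (both hypothetical) are available.
import smtplib
from email.message import EmailMessage


def construct_email(workouts):
    msg = EmailMessage()
    msg['Subject'] = 'Scraped workouts'
    msg['From'] = 'scraper@example.com'  # hypothetical sender
    msg['To'] = 'me@example.com'         # hypothetical recipient
    msg.set_content('\n'.join(workouts))
    return msg


def send_email(msg):
    with smtplib.SMTP('localhost') as server:  # assumes a local SMTP relay
        server.send_message(msg)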
# This doesn't make sense but was easiest
import gcintegration
import scrape
import time

scrape.main()
time.sleep(20)
gcintegration.main()
@classmethod
def setUpClass(cls):
    cmd = "https://tabs.ultimate-guitar.com/tab/ed-sheeran/perfect-chords-1956589 -f test -i -c -a -l -j"
    scrape.main(cmd.split(" "))
import save_contents
import scrape
import get_contents

links = []
with open('list_of_algorithms_link.txt', 'r') as f:
    links = f.readlines()

for link in links:
    scrape.main(link)
    save_contents.main()
    get_contents.main()
def index():
    """Present the data in a RESTful API."""
    return jsonify(main())
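# The route decorator and imports are not shown above; this is a minimal
# sketch of how such an endpoint is typically wired up, assuming Flask, an
# application object named `app` (an assumption), and a main() that returns
# JSON-serialisable data (it is the same main() the snippet above calls and
# is not defined here).
from flask import Flask, jsonify

app = Flask(__name__)


@app.route('/')
def index():
    """Present the data in a RESTful API."""
    return jsonify(main())

# During development this could be served with e.g. `flask run`.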
def scrape_links(time_before_new_changed, title=None, links=None,
                 unscraped_links_filename=os.path.join(
                     '..', 'data', 'links', 'links_unscraped.txt'),
                 done_links_filename=os.path.join(
                     '..', 'data', 'links', 'done_links.txt')):
    """Scrape links from pages on candidate URLs and retrieve any synonyms."""
    start_time = time.time()
    done_links = get_done_links(done_links_filename)
    if links is None:
        links = get_unscraped_links(unscraped_links_filename, done_links)
    syn_count = len(os.listdir(os.path.join('..', 'data', 'synonyms_new')))
    print('Found {} synonym-files at start of while-loop.\n'.format(syn_count))
    while links:
        if time.time() > start_time + time_before_new_changed:
            print('Time {} seconds exceeded; getting new changed links.'.format(
                time_before_new_changed))
            links = get_recent_changes(links, done_links)
            start_time = time.time()
        title = links.pop()
        try:
            page, _, synonyms, new_links = S.main(title)
        except KeyboardInterrupt:
            print('\nWe met with KeyboardInterrupt; title: {}.'.format(title))
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback)
            return links, done_links
        except TypeError:
            # TypeError: 'NoneType' object is not iterable.
            # Usually because "HTTP Error 404: Not Found", so restore title.
            # But temporarily we are leaving titles unrestored, as we think
            # some of these were previously unfiltered `redlink=1` cases.
            links.add(title)
            try:
                done_links.remove(title)
            except KeyError:
                pass
            print('    {}'.format(title))
            continue
        except Exception:
            print('\nWe met with Exception; title: {}.'.format(title))
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback)
            print('\n')
            continue
        # Do not examine whether title in done_links;
        # would prevent utility of "get_recent_changes()".
        # Assume new links are checked only when received from S.main().
        with open(done_links_filename, 'a') as f:
            f.write('\n' + title)
        if synonyms:
            _ = U.store_data(json.dumps(synonyms).encode(), title,
                             target_dir='synonyms_new', tar=False)
            syn_count = len(
                os.listdir(os.path.join('..', 'data', 'synonyms_new')))
        links, new_links, done_links = update_links(
            links, new_links, done_links, title)
        print('''T: {}; links: + {:>3} => {:>}; done: {} ({}%); '''
              '''syn: + {} => {} ({}%);\n    {}'''.format(
                  int(time.time() - start_time), len(new_links), len(links),
                  len(done_links),
                  round(100 * len(done_links) /
                        (len(done_links) + len(links)), 1),
                  len(synonyms), syn_count,
                  round(100 * syn_count / len(done_links), 1), title))
        # Uncomment the following line to save whole pages (compressed).
        # _ = U.store_data(page, title, target_dir='html_new', tar=True)

        # # Write the whole of "links": "title" removed, "new_links" added.
        # try:
        #     with open(unscraped_links_filename, 'w') as f:
        #         f.write('\n'.join(links))
        # except KeyboardInterrupt:
        #     print('\nWe met with KeyboardInterrupt; title: {}.'.format(title))
        #     exc_type, exc_value, exc_traceback = sys.exc_info()
        #     traceback.print_exception(exc_type, exc_value, exc_traceback)
        #     return links, done_links
    return links, done_links
#!/bin/python3
# Used to call scrape.py by passing it a text file with multiple entries
# and categories.
# Each input line should be "<category> <url>"; make sure a single space
# separates them so the line parses correctly.
import sys
import scrape

# print(sys.argv[1])
infile = open(sys.argv[1], "r")
count = 0
for item in infile:
    insplit = item.strip().split(" ")  # strip the trailing newline before splitting
    category = insplit[0]
    url = insplit[1]
    # print(url)
    # print(category)
    scrape.main(url, category)
    count = count + 1
print(str(count) + " file(s) processed.")
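# Example usage (the script name, input file name, and URLs below are all
# hypothetical; any "<category> <url>" pairs separated by a single space
# will do):
#
#   $ cat sites.txt
#   sorting https://example.com/articles/sorting
#   graphs https://example.com/articles/graphs
#   $ python3 batch_scrape.py sites.txt
#   2 file(s) processed.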
# You should have received a copy of the GNU General Public License
# along with roastcalc. If not, see <http://www.gnu.org/licenses/>.

### Define magic numbers ###
safetyBuffer = 0.005  # Add this percent to roast input weight
cupQuantity = 0.2     # Add to roast for production cupping
emailBool = False     # Email report or no?
emails = ["*****@*****.**"]

### Import data from CSV files ###
import scrape  # Download the spreadsheet as CSV files
import csv     # Parse the CSV files
import os      # Run the email command

# Download the files
scrape.main()

# Column indices for history.csv:
# ID-Tag, Profile, Date, Component, Start Weight, End Weight, % Loss
# 0,      1,       2,    3,         4,            5,          6
historyCSV = list(csv.reader(open("history.csv")))

# Column indices for products.csv:
# Product, Component, Component #, Roast profile, % of blend
# 1,       2,         3,           4,             5
productsCSV = list(csv.reader(open("products.csv")))

# List of available products
uniqueProducts = set([product[0] for product in productsCSV])

# Column indices for totals.csv:
# Product, 3oz, 12oz, 2lb, 5lb, total lbs, roast/don't roast [1/0]
import os
os.chdir(os.getenv("HOME") +
         "/Dropbox/Courses/MIT/1.2 LDA - Finding Themes in Project Description/MITprojects")

# import gather
# abs_dict = gather.main(stop=2)

import json
with open('abs_dict.json') as abs_dict_file:  # load abs_dict from file
    abs_dict = json.load(abs_dict_file)

import scrape
scrape.main(abs_dict)
scrape.tokenize()

# Now run LDA. In onlineMIT I coded the path to ../MITprojects/ etc.
# This creates some .txt files. Shell steps:
# os.chdir(os.getenv("HOME") + "/Dropbox/Courses/MIT/1.2 LDA - Finding Themes in Project Description/onlineldavb")
#   cd "/home/kinkyboy/Dropbox/Courses/MIT/1.2 LDA - Finding Themes in Project Description/onlineldavb"
#   python onlineMIT.py 101
#   python printtopics.py dictnostops.txt lambda-100.dat  # NaN

# TODO
# - remove symbols from tokens
# - check wikirandom for regular expressions: Saxon genitives should be removed,
#   e.g. "an agent's" in 'Leslie Kaelbling_abs20.txt'

# Notes
# python onlinewikipedia.py 101; python printtopics.py dictnostops.txt lambda-100.dat
# python printtopics.py dictnostops.txt lambda-10.dat
# https://wellecks.wordpress.com/2014/10/26/ldaoverflow-with-online-lda/
# (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
# Printed output is in the return of do_e_step_docs: (gamma, sstats) = self.do_e_step_docs(docs)