def insert_links(date, version, con):
    # Insert links into DB
    with con:
        links_cur = con.cursor()
        links_gen = links.get_links(date, version)
        for link in links_gen:
            links_cur.execute(
                "INSERT INTO links VALUES(%s, %s, %s, %s, %s, %s, %s)",
                link)
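The %s placeholders imply a DB-API driver such as psycopg2 or MySQLdb. A brief usage sketch, assuming psycopg2, an assumed DSN, and that links.get_links(date, version) yields 7-field tuples matching the links table:

import psycopg2
import links

# Usage sketch only: the connection string and the table schema are assumptions,
# not part of the original example.
con = psycopg2.connect("dbname=linksdb user=postgres")
insert_links("2024-05-01", "v2", con)   # "with con:" inside insert_links commits the transaction
con.close()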
def backup_account(url='', org='', key='', account='', backupdir='', **kwargs):
    # create directory structure
    backup_dir = create_dir(os.getcwd(), backupdir)
    org_dir = create_dir(backup_dir, org)
    account_dir = create_dir(org_dir, account)

    # backup agents
    agent_dir = create_dir(account_dir, 'agents')
    for agent_json in agents.get_agents(url=url, org=org, account=account, key=key):
        agent_path = os.path.join(agent_dir, str(agent_json['name']) + '.json')
        remove_keys = ['presence_state', 'created', 'modified', 'heartbeat']
        for k in remove_keys:
            if k in agent_json:
                del agent_json[k]
        with open(agent_path, 'w') as f:
            f.write(json.dumps(agent_json, indent=4))

    # backup dashboards
    dashboard_dir = create_dir(account_dir, 'dashboards')
    for d in dashboards.get_dashboards(url=url, org=org, account=account, key=key):
        dashboard_path = os.path.join(dashboard_dir, str(d['name']) + '.yaml')
        with open(dashboard_path, 'w') as f:
            f.write(yaml.safe_dump(d, default_flow_style=False, explicit_start=True))

    # backup plugins
    plugin_dir = create_dir(account_dir, 'plugins')
    for p in plugins.get_plugins(url=url, org=org, account=account, key=key):
        plugin_path = os.path.join(plugin_dir, str(p['name']) + '.' + str(p['extension']))
        with open(plugin_path, 'w') as f:
            f.write(plugins.export_plugin(plugin=p['name'], url=url, org=org,
                                          account=account, key=key))

    # backup rules
    rule_dir = create_dir(account_dir, 'rules')
    for r in rules.get_rules(url=url, org=org, account=account, key=key):
        rule_path = os.path.join(rule_dir, str(r['name']) + '.yaml')
        with open(rule_path, 'w') as f:
            rule_content = yaml.safe_load(rules.export_rule(rule=r['id'], url=url, org=org,
                                                            account=account, key=key))
            if rule_content['actions']:
                action_count = len(rule_content['actions'])
                for i in range(action_count):
                    try:
                        del rule_content['actions'][i]['details']['status']
                    except KeyError:
                        continue
            f.write(yaml.safe_dump(rule_content, default_flow_style=False, explicit_start=True))

    # backup links
    link_dir = create_dir(account_dir, 'links')
    for l in links.get_links(url=url, org=org, account=account, key=key):
        link_path = os.path.join(link_dir, l['id'] + '.json')
        link_json = links.export_link(link_id=l['id'], url=url, org=org, account=account, key=key)
        with open(link_path, 'w') as f:
            f.write(json.dumps(link_json, indent=4))
def test_get_links_code(self):
    text = read_file("code.js")
    self.assertEqual(
        [
            "https://stackoverflow.com/a/57804949",
            "http://www.google.com",
            "http://www.mylink.com",
            "http://www.yourlink.com",
            "http://www.test.com",
            "http://www.facebook.com",
        ],
        get_links(text),
    )
from bs4 import BeautifulSoup
import urllib3

import functions
import links

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Get list of canceled/renewed TV shows
http = urllib3.PoolManager()
url = 'https://www.metacritic.com/feature/tv-renewal-scorecard-2017-2018-season'
fp = http.request('GET', url)
soup = BeautifulSoup(fp.data, features='lxml')
data = soup.find_all('p', {'class': 'medium'})

func = functions.get_data()
links = links.get_links()
#links.get_IMDb_links('$100,000 Pyramid')

# Parse the webpage to build lists of canceled/renewed/rescued TV shows
for text in data:
    if "has renewed" in text.text or "renewed for" in text.text:
        func.is_renewed(text)
    elif "canceled" in text.text:
        func.is_canceled(text)
    if "rescued" in text.text:
        func.is_rescued(text)

# If a show was rescued, remove it from the canceled list if it is there
for i in func.removeFromCanceled:
    func.canceled = [show for show in func.canceled if show != i]  # assumed completion; the original example is truncated here
import links
import DB_connect
from datetime import date, timedelta
from json import JSONDecoder

Days = 1
while Days < 2920:  # 365*8 = 2920; the site's archive only goes back to 2006
    url = "http://www.bloomberg.com/archive/news/"
    d = date.today() - timedelta(days=Days)  # Start from the day before today because of time zones
    d = d.strftime('%Y-%m-%d')
    url = url + d + "/"
    list_links = links.get_links(url)   # Articles published on a single day
    total_links = len(list_links)       # Total articles published on that day
    print("Total articles for date:")
    print(d, "are:", total_links)

    # Fetch the details of each article by parsing its page, then store the JSON data in the DB
    for index in range(0, total_links):
        print(list_links[index])
        data = links.get_articles(list_links[index])
        data = JSONDecoder().decode(data)
        db = DB_connect.mongo_insert(data)  # Data inserted in DB

    # After fetching all articles for a day, increase Days by one so the date shifts to the previous day
    Days += 1
def test_get_links(self):
    text = read_file("text.txt")
    self.assertEqual(
        ["http://www.example.org/bag.aspx", "http://example.com/bat.html"],
        get_links(text),
    )
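The two unit tests above (test_get_links_code and test_get_links) exercise a get_links helper that pulls URLs out of plain text; its implementation is not shown in these examples. A minimal sketch, assuming a simple regex-based extractor, could look like this:

import re

# Sketch only: the real helper may use a different pattern. Assumption: URLs are
# delimited by whitespace, quotes, or angle brackets in the input text.
URL_PATTERN = re.compile(r"""https?://[^\s<>"']+""")

def get_links(text):
    """Return every http(s) URL found in `text`, in order of appearance."""
    return URL_PATTERN.findall(text)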
def backup_account(url='', org='', key='', account='', backup_dir='', **kwargs):
    # create directory structure
    backup_dir = create_dir(os.getcwd(), backup_dir)
    org_dir = create_dir(backup_dir, org)
    account_dir = create_dir(org_dir, account)

    # backup agents
    agent_dir = create_dir(account_dir, 'agents')
    for agent in agents.get_agents(url=url, org=org, account=account, key=key):
        logging.debug('Exporting JSON for agent "%s"', agent['name'])
        # some agents can have a name like 'http://...', so encode the name before building the file path
        agent_path = os.path.join(agent_dir, str(urllib.quote(agent['name'], safe='')) + '.json')
        remove_keys = ['presence_state', 'created', 'modified', 'heartbeat']
        for k in remove_keys:
            if k in agent:
                del agent[k]
        with open(agent_path, 'w') as f:
            f.write(json.dumps(agent, indent=4))

    # backup dashboards
    dashboard_dir = create_dir(account_dir, 'dashboards')
    for dash in dashboards.get_dashboards(url=url, org=org, account=account, key=key):
        logging.debug('Exporting YAML for dashboard "%s"', dash['name'])
        dashboard_path = os.path.join(dashboard_dir, str(dash['name']) + '.yaml')
        with open(dashboard_path, 'w') as f:
            f.write(yaml.safe_dump(dash, default_flow_style=False, explicit_start=True))

    # backup plugins
    plugin_dir = create_dir(account_dir, 'plugins')
    for plugin in plugins.get_plugins(url=url, org=org, account=account, key=key):
        logging.debug('Exporting plugin "%s"', plugin['name'])
        plugin_path = os.path.join(plugin_dir, str(plugin['name']) + '.' + str(plugin['extension']))
        with open(plugin_path, 'w') as f:
            f.write(plugins.export_plugin(plugin=plugin['name'], url=url, org=org,
                                          account=account, key=key))

    # backup rules
    rule_dir = create_dir(account_dir, 'rules')
    for rule in rules.get_rules(url=url, org=org, account=account, key=key):
        logging.debug('Exporting YAML for rule "%s" with id %s', rule['name'], rule['id'])
        rule_path = os.path.join(rule_dir, str(rule['name']) + '.yaml')
        with open(rule_path, 'w') as f:
            rule_yaml = rules.export_rule(rule=rule['id'], url=url, org=org, account=account, key=key)
            try:
                rule_content = yaml.safe_load(rule_yaml)
                if rule_content['actions']:
                    action_count = len(rule_content['actions'])
                    for i in range(action_count):
                        try:
                            del rule_content['actions'][i]['details']['status']
                        except KeyError:
                            continue
                f.write(yaml.safe_dump(rule_content, default_flow_style=False, explicit_start=True))
            except yaml.YAMLError as e:
                logging.warn('Unable to parse YAML for rule %s: %s', rule['name'], e.problem)
                f.write(rule_yaml)

    # backup links
    link_dir = create_dir(account_dir, 'links')
    for link in links.get_links(url=url, org=org, account=account, key=key):
        logging.debug('Exporting JSON for pack "%s" with id %s', link['plugin'], link['id'])
        link_path = os.path.join(link_dir, link['id'] + '.json')
        link_json = links.export_link(link_id=link['id'], url=url, org=org, account=account, key=key)
        with open(link_path, 'w') as f:
            f.write(json.dumps(link_json, indent=4))
def generate(self):
    self.out_text.delete(1.0, END)
    self.out_text.insert(
        END, "\n".join(get_links(self.input_text.get(1.0, END))))
# canonical names that don't
#   have a row in locations.tsv
#   have a label in latin letters
#   have a label in the tengwar
#   have a region in regions.svg

from locations import locations as get_locations, CANONICAL, SINDARIN, ROMAN, OTHER
locations = get_locations()

from regions import regions2 as get_regions
regions = get_regions()

from links import links as get_links
links = get_links()

from names import names as get_names
names = get_names()

from labels import abnormal_labels as get_abnormal_labels, normalized_labels as get_normalized_labels
normalized_labels = [label for name, label in get_normalized_labels()]
abnormal_labels = [label for name, label in get_abnormal_labels()]

postpone_canonical_names = set(name.strip() for name in open("postpone.txt"))


def canonical_names():
    canonical_names = set()
    source_pairs = (
        # ('location', set(zip(*locations)[0])),
from bs4 import BeautifulSoup as bs
import requests

from links import get_links

filename = 'timesofindia_tweets.csv'
targets = get_links(filename)

'''
format for data:
{
    'source': 'times_of_india',
    'title': "",
    'paragraphs': [""],
    'publication': "<timestamp>",
    'category': "",
    'tags': ""
}
'''

for target in targets:
    row = {
        'source': 'times_of_india',
        'title': "",
        'paragraphs': [""],
        'publication': "",
        'category': "",
        'tags': ""
    }
    row['publication'] = target['datetime']
    req = requests.get(target['link'])
    soup = bs(req.text, 'html.parser')
    heading = soup.find_all('h1')
    for i in heading:
        row['title'] = i.text.strip()  # assumed continuation; the original example is truncated here