def search(data): """ Search OMDB for an id based on a name/year. :param data: {name, year} :return: id """ name = data['name'] year = data['year'] # if we managed to parse the year from the name # include it, since it'll narrow results if year: year_query = '&y={}'.format(year.replace('(', '').replace(')', '')) else: year_query = '' try: result = requests.get(TMDB_SEARCH_URL + name + year_query).json() except: log.critical('There was a problem accessing the IMDB API page.') return None if 'results' in result: for movie in result['results']: ratio = difflib.SequenceMatcher( None, pynab.ids.clean_name(name), pynab.ids.clean_name(movie['title'])).ratio() if ratio > 0.8 and year in movie['release_date']: temp = requests.get('https://api.themoviedb.org/3/movie/{}'. format(movie['id']) + '?api_key=' + API_KEY).json() return temp['imdb_id'] return None
def search(data): """ Search OMDB for an id based on a name/year. :param data: {name, year} :return: id """ name = data['name'] year = data['year'] # if we managed to parse the year from the name # include it, since it'll narrow results if year: year_query = '&y={}'.format(year.replace('(', '').replace(')', '')) else: year_query = '' try: result = requests.get(OMDB_SEARCH_URL + name + year_query).json() except: log.critical('There was a problem accessing the IMDB API page.') return None if 'Search' in result: for movie in result['Search']: # doublecheck, but the api should've searched properly ratio = difflib.SequenceMatcher(None, pynab.ids.clean_name(name), pynab.ids.clean_name(movie['Title'])).ratio() if ratio > 0.8 and year == movie['Year'] and movie['Type'] == 'movie': return movie['imdbID'] return None
def search(data): """ Search OMDB for an id based on a name/year. :param data: {name, year} :return: id """ name = data['name'] year = data['year'] # if we managed to parse the year from the name # include it, since it'll narrow results if year: year_query = '&y={}'.format(year.replace('(', '').replace(')', '')) else: year_query = '' try: result = requests.get(OMDB_SEARCH_URL + name + year_query).json() except: log.critical('There was a problem accessing the IMDB API page.') return None if 'Search' in result: for movie in result['Search']: # doublecheck, but the api should've searched properly ratio = difflib.SequenceMatcher( None, pynab.ids.clean_name(name), pynab.ids.clean_name(movie['Title'])).ratio() if ratio > 0.8 and year == movie['Year'] and movie[ 'Type'] == 'movie': return movie['imdbID'] return None
def search(name, year):
    """Search OMDB for a movie and return the matching search result (including its IMDB id)."""

    # if we managed to parse the year from the name
    # include it, since it'll narrow results
    if year:
        year_query = '&y={}'.format(year.replace('(', '').replace(')', ''))
    else:
        year_query = ''

    data = {}
    try:
        r = requests.get(OMDB_SEARCH_URL + name + year_query)
        data = r.json()
    except:
        log.critical('There was a problem accessing the IMDB API page.')
        return None

    if 'Search' in data:
        for movie in data['Search']:
            # doublecheck, but the api should've searched properly
            ratio = difflib.SequenceMatcher(None, clean_name(name), clean_name(movie['Title'])).ratio()
            if ratio > 0.8 and year == movie['Year'] and movie['Type'] == 'movie':
                return movie
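# --- Hedged usage sketch (not part of the original source) ---------------------------
# Shows how the OMDB-backed search() above might be called by a post-processing step.
# The release name and year below are made-up examples, and _example_omdb_lookup() is a
# hypothetical helper; it assumes OMDB_SEARCH_URL, log and clean_name exist at module
# level, exactly as search() itself already requires.
def _example_omdb_lookup():
    # search() returns the raw OMDB search result dict on a confident match, else None
    match = search('The Example Movie', '2014')
    if match:
        # in OMDB search results the IMDB id lives under 'imdbID'
        return match['imdbID']
    return None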
def search_lxml(show, content):
    """Search TVRage online API for show data."""
    try:
        tree = etree.fromstring(content)
    except:
        log.critical('Problem parsing XML with lxml')
        return None

    matches = defaultdict(list)
    # parse show names in the same order as returned by tvrage, first one is usually the good one
    for xml_show in XPATH_SHOW(tree):
        for name in extract_names(xml_show):
            ratio = int(difflib.SequenceMatcher(None, show['clean_name'], clean_name(name)).ratio() * 100)
            if ratio == 100:
                return xmltodict.parse(etree.tostring(xml_show))['show']

            matches[ratio].append(xml_show)

    # if no 100% is found, check highest ratio matches
    for ratio, xml_matches in sorted(matches.items(), reverse=True):
        for xml_match in xml_matches:
            if ratio >= 80:
                return xmltodict.parse(etree.tostring(xml_match))['show']
            elif 80 > ratio > 60:
                if 'country' in show and show['country'] and XPATH_COUNTRY(xml_match):
                    if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)[0]):
                        return xmltodict.parse(etree.tostring(xml_match))['show']
def process_movies():
    try:
        return pynab.ids.process('movie', interfaces=config.postprocess.get('process_movies'), limit=500)
    except Exception as e:
        log.critical(traceback.format_exc())
        raise Exception
def daemonize(pidfile):
    try:
        import traceback
        from daemonize import Daemonize

        daemon = Daemonize(app='pynab', pid=pidfile, action=main)
        daemon.start()
    except SystemExit:
        raise
    except:
        log.critical(traceback.format_exc())
def get_nzb_details(nzb):
    """Returns a JSON-like Python dict of NZB contents, including extra information
    such as a list of any nfos/rars that the NZB references."""

    try:
        # using the html parser here instead of the straight lxml might be slower
        # but some of the nzbs spewed forth by newznab are broken and contain
        # non-xml entities, ie. &sup2;
        # this breaks the normal lxml parser
        tree = html.fromstring(gzip.decompress(nzb.data))
    except Exception as e:
        log.critical('nzbs: problem parsing XML with lxml: {}'.format(e))
        return None

    nfos = []
    sfvs = []
    rars = []
    pars = []
    zips = []

    rar_count = 0
    par_count = 0

    for file_subject in XPATH_FILE(tree):
        if rar_part_regex.search(file_subject):
            rar_count += 1
        if nfo_regex.search(file_subject) and not metadata_regex.search(file_subject):
            nfos.append(filexml_to_dict(file_subject.getparent()))
        if sfv_regex.search(file_subject):
            sfvs.append(filexml_to_dict(file_subject.getparent()))
        if rar_regex.search(file_subject) and not metadata_regex.search(file_subject):
            rars.append(filexml_to_dict(file_subject.getparent()))
        if par2_regex.search(file_subject):
            par_count += 1
            if not par_vol_regex.search(file_subject):
                pars.append(filexml_to_dict(file_subject.getparent()))
        if zip_regex.search(file_subject) and not metadata_regex.search(file_subject):
            zips.append(filexml_to_dict(file_subject.getparent()))

    return {
        'nfos': nfos,
        'sfvs': sfvs,
        'rars': rars,
        'pars': pars,
        'zips': zips,
        'rar_count': rar_count,
        'par_count': par_count,
    }
def daemonize(pidfile):
    try:
        import traceback
        from daemonize import Daemonize

        fds = []
        if log_descriptor:
            fds = [log_descriptor]

        daemon = Daemonize(app='pynab', pid=pidfile, action=main, keep_fds=fds)
        daemon.start()
    except SystemExit:
        raise
    except:
        log.critical(traceback.format_exc())
def get_size(nzb):
    """Returns the size of a release (in bytes) as given by the NZB, compressed."""

    try:
        # using the html parser here instead of the straight lxml might be slower
        # but some of the nzbs spewed forth by newznab are broken and contain
        # non-xml entities, ie. &sup2;
        # this breaks the normal lxml parser
        tree = html.fromstring(gzip.decompress(nzb.data))
    except Exception as e:
        log.critical('nzbs: problem parsing XML with lxml: {}'.format(e))
        return None

    size = 0
    for bytes in XPATH_BYTES(tree):
        try:
            size += int(bytes)
        except:
            # too bad, there was a problem
            pass

    return size
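# --- Hedged usage sketch (not part of the original source) ---------------------------
# Illustrates how the two NZB helpers above might be combined when inspecting a release.
# _example_nzb_summary() and its `release` argument are hypothetical; `release.nzb` is
# assumed to be the same kind of object the helpers expect (gzipped NZB XML in .data).
def _example_nzb_summary(release):
    details = get_nzb_details(release.nzb)
    if not details:
        return None
    return {
        'compressed_size': get_size(release.nzb),  # bytes, as reported by the NZB
        'has_nfo': bool(details['nfos']),
        'rar_count': details['rar_count'],
        'par_count': details['par_count'],
    }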
def process_requests():
    try:
        return pynab.requests.process(500)
    except Exception as e:
        log.critical(traceback.format_exc())
        raise Exception
def process_imdb():
    try:
        return pynab.imdb.process(500)
    except Exception as e:
        log.critical(traceback.format_exc())
        raise Exception
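# --- Hedged sketch (not part of the original source) ---------------------------------
# The process_* wrappers above each log a traceback and re-raise, so something like a
# scheduler presumably drives them. _example_postprocess_pass() is a hypothetical
# illustration of such a caller, not the project's actual scheduling code.
def _example_postprocess_pass():
    for job in (process_requests, process_imdb, process_movies):
        try:
            job()
        except Exception:
            # the wrapper has already logged its own traceback; move on to the next job
            continue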
def save_all(parts):
    """Save a set of parts to the DB, in a batch if possible."""
    if parts:
        start = time.time()
        group_name = list(parts.values())[0]['group_name']

        with db_session() as db:
            # this is a little tricky. parts have no uniqueness at all.
            # no uniqid and the posted dates can change since it's based off the first
            # segment that we see in that part, which is different for each scan.
            # what we do is get the next-closest thing (subject+author+group) and
            # order it by oldest first, so when it's building the dict the newest parts
            # end on top (which are the most likely to be being saved to).
            # realistically, it shouldn't be a big problem - parts aren't stored in the db
            # for very long anyway, and they're only a problem while there. saving 500 million
            # segments to the db is probably not a great idea anyway.
            existing_parts = dict(
                ((part.hash, part) for part in
                 db.query(Part.id, Part.hash)
                 .filter(Part.hash.in_(parts.keys()))
                 .filter(Part.group_name == group_name)
                 .order_by(Part.posted.asc())
                 .all())
            )

            part_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if not existing_part:
                    segments = part.pop('segments')
                    part_inserts.append(part)
                    part['segments'] = segments

            if part_inserts:
                ordering = ['hash', 'subject', 'group_name', 'posted', 'posted_by', 'total_segments', 'xref']

                s = io.StringIO()
                for part in part_inserts:
                    for item in ordering:
                        if item == 'posted':
                            s.write('"' + part[item].replace(tzinfo=None).strftime('%Y-%m-%d %H:%M:%S').replace('"', '\\"') + '",')
                        elif item == 'xref':
                            # leave off the comma
                            s.write('"' + part[item].encode('utf-8', 'replace').decode('utf-8').replace('"', '\\"') + '"')
                        else:
                            s.write('"' + str(part[item]).encode('utf-8', 'replace').decode().replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Part):
                    return False

                s.close()
                db.close()

        with db_session() as db:
            existing_parts = dict(
                ((part.hash, part) for part in
                 db.query(Part)
                 .options(
                     subqueryload('segments'),
                     Load(Part).load_only(Part.id, Part.hash),
                     Load(Segment).load_only(Segment.id, Segment.segment)
                 )
                 .filter(Part.hash.in_(parts.keys()))
                 .filter(Part.group_name == group_name)
                 .order_by(Part.posted.asc())
                 .all())
            )

            segment_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if existing_part:
                    segments = dict(((s.segment, s) for s in existing_part.segments))
                    for segment_number, segment in part['segments'].items():
                        if int(segment_number) not in segments:
                            segment['part_id'] = existing_part.id
                            segment_inserts.append(segment)
                        else:
                            # we hit a duplicate message for a part
                            # kinda wish people would stop reposting shit constantly
                            pass
                else:
                    log.critical('parts: part didn\'t exist when we went to save it. backfilling with dead_binary_age not set to 0?')
                    return False

            if segment_inserts:
                ordering = ['segment', 'size', 'message_id', 'part_id']

                s = io.StringIO()
                for segment in segment_inserts:
                    for item in ordering:
                        if item == 'part_id':
                            # leave off the tab
                            s.write('"' + str(segment[item]).replace('"', '\\"') + '"')
                        else:
                            s.write('"' + str(segment[item]).encode('utf-8', 'replace').decode('utf-8').replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Segment):
                    return False

                s.close()
                db.close()

        end = time.time()

        log.debug('parts: saved {} parts and {} segments in {:.2f}s'.format(
            len(part_inserts), len(segment_inserts), end - start
        ))

        del part_inserts[:]
        del segment_inserts[:]

    return True