def copy_variable_ref_to_graph(input_graph, output_graph, var_ref, init_value, scope=''):
    """Copy the Variable referenced by *var_ref* from one graph into another.

    The copy keeps the original variable's name (optionally prefixed with
    *scope*), its trainability and its collection memberships, but is
    initialised from *init_value* instead of the original initialiser.

    Args:
        input_graph: graph the variable currently lives in.
        output_graph: graph to create the copy in.
        var_ref: reference tensor of the variable to copy.
        init_value: initial value for the new variable; its ``.shape`` is
            also forced onto the new variable.
        scope: optional name-scope prefix for the copy ('' for none).

    Returns:
        The newly created Variable in *output_graph*.
    """
    # Strip the output suffix (e.g. ':0') from the ref name to get the
    # plain variable name, then optionally prefix it with the scope.
    if scope != '':
        new_name = (scope + '/' + var_ref.name[:var_ref.name.index(':')])
    else:
        new_name = var_ref.name[:var_ref.name.index(':')]
    # Reproduce the variable's collection memberships in the new graph.
    # NOTE(review): reaches into the private `_collections` mapping of the
    # input graph — fragile across framework versions.
    collections = []
    for name, collection in input_graph._collections.items():
        if var_ref in collection:
            # Global/trainable collections keep their canonical names;
            # everything else is namespaced under the scope (when given).
            if (name == ops.GraphKeys.GLOBAL_VARIABLES or
                    name == ops.GraphKeys.TRAINABLE_VARIABLES or
                    scope == ''):
                collections.append(name)
            else:
                collections.append(scope + '/' + name)
    # Preserve trainability of the original variable.
    trainable = (var_ref in input_graph.get_collection(
        ops.GraphKeys.TRAINABLE_VARIABLES))
    with output_graph.as_default():
        # validate_shape=False so the copy accepts init_value as-is; the
        # static shape is then pinned explicitly below.
        new_var = Variable(init_value, trainable, name=new_name,
                           collections=collections, validate_shape=False)
        new_var.set_shape(init_value.shape)
        return new_var
def set_predefined_obj(self, key, obj):
    """Register *obj* under *key* in every collection on the ancestor chain.

    Walks from ``self.collection`` up through its parents to the root, then
    (root-first) installs a fresh Attribute wrapping *obj* into each
    collection's ``attributes`` map, and records input snapshots for plain
    values.
    """
    # Gather the ancestor chain: self.collection, its parent, ... up to the
    # root (parent is None), then reverse so the root is processed first.
    collections = []
    c = self.collection
    while True:
        collections.append(c)
        c = c.parent
        if c is None:
            break
    collections.reverse()
    # NOTE(review): old_value/value are assigned but never used in this
    # block — possibly leftovers from an earlier revision.
    old_value = None
    value = None
    for collection in collections:
        # A fresh Attribute per collection, parented to self and revised
        # with the incoming object.
        attribute = Attribute(key)
        attribute.parent = self
        attribute.revise(obj)
        collection.attributes[key] = attribute
        # Instances, functions and modules are not recorded as inputs —
        # only plain values get an input snapshot.
        if isinstance(obj.get_value(), Instance) or isinstance(
                obj.get_value(), FuncValue) or isinstance(
                obj.get_value(), ModuleValue):
            continue
        # NOTE(review): the same value is stored three times here — presumably
        # (obj, original value, current value, ...) slots; confirm against the
        # consumer of `collection.inputs`.
        collection.inputs[attribute] = (attribute.get_obj(),
                                        attribute.get_obj().get_value(),
                                        attribute.get_obj().get_value(),
                                        attribute.get_obj().get_value())
def from_configuration(
    cls,
    name,
    crs,
    name_dir_pairs,
    glob_pattern='*.tif',
    img_collection_cls=ImageCollection,
):
    """
    Build a NestedImageCollection from [collection name, directory] pairs.

    Convenience constructor: each pair yields one image collection whose
    directory is scanned for files matching *glob_pattern*. For example,
    to produce a nested collection of OS map tiles::

        r = NestedImageCollection.from_configuration(
            'os', ccrs.OSGB(),
            [['OS 1:1,000,000', '/directory/to/1_to_1m'],
             ['OS 1:250,000', '/directory/to/1_to_250k'],
             ['OS 1:50,000', '/directory/to/1_to_50k'],
             ],
        )
    """
    def _scanned(pair_name, pair_dir):
        # Create one collection and populate it from its directory.
        coll = img_collection_cls(pair_name, crs)
        coll.scan_dir_for_imgs(pair_dir, glob_pattern=glob_pattern)
        return coll

    return cls(name, crs,
               [_scanned(n, d) for n, d in name_dir_pairs])
def from_configuration(cls, name, crs, name_dir_pairs, glob_pattern='*.tif', img_collection_cls=ImageCollection):
    """
    Build a NestedImageCollection from [collection name, directory] pairs.

    This is very convenient functionality for simple configuration level
    creation of this complex object. For example, to produce a nested
    collection of OS map tiles::

        files = [['OS 1:1,000,000', '/directory/to/1_to_1m'],
                 ['OS 1:250,000', '/directory/to/1_to_250k'],
                 ['OS 1:50,000', '/directory/to/1_to_50k'],
                 ]
        r = NestedImageCollection.from_configuration('os', ccrs.OSGB(), files)
    """
    built = []
    for coll_name, coll_dir in name_dir_pairs:
        # One collection per configured directory, populated by scanning
        # the directory for matching image files.
        coll = img_collection_cls(coll_name, crs)
        coll.scan_dir_for_imgs(coll_dir, glob_pattern=glob_pattern)
        built.append(coll)
    return cls(name, crs, built)
def document_collection(resource, path, root_discovery, discovery, css=CSS):
    """Document a single collection in an API.

    Args:
      resource: Collection or service being documented.
      path: string, Dot separated name of the resource.
      root_discovery: Deserialized discovery document.
      discovery: Deserialized discovery document, but just the portion that
        describes the resource.
      css: string, The CSS to include in the generated file.

    Returns:
      string, a complete HTML page documenting the resource's methods and
      sub-collections.
    """
    collections = []
    methods = []
    html = [
        "<html><body>",
        css,
        "<h1>%s</h1>" % breadcrumbs(path[:-1], root_discovery),
        "<h2>Instance Methods</h2>",
    ]

    # Partition the public callables of the resource: attributes flagged
    # __is_resource__ are sub-collections, everything else is a method.
    for name in dir(resource):
        if not name.startswith("_") and callable(getattr(resource, name)):
            if hasattr(getattr(resource, name), "__is_resource__"):
                collections.append(name)
            else:
                methods.append(name)

    # TOC links. The names were already filtered above, so no re-check is
    # needed (the original redundantly repeated the startswith/callable test).
    for name in collections:
        href = path + name + ".html"
        html.append(
            string.Template(COLLECTION_LINK).substitute(href=href, name=name)
        )
    for name in methods:
        doc = getattr(resource, name).__doc__
        params = method_params(doc)
        firstline = doc.splitlines()[0]
        html.append(
            string.Template(METHOD_LINK).substitute(
                name=name, params=params, firstline=firstline
            )
        )

    # Full per-method detail sections. (Dead locals `resource_name` and
    # `dname` from the original were removed — they were never used.)
    if methods:
        html.append("<h3>Method Details</h3>")
        for name in methods:
            html.append(method(name, getattr(resource, name).__doc__))

    html.append("</body></html>")
    return "\n".join(html)
def from_configuration(cls, name, crs, name_dir_pairs, glob_pattern='*.tif', img_class=Img):
    """
    Build a :class:`~cartopy.io.img_nest.NestedImageCollection` from a
    list of image collection name and directory path pairs.

    For example, to produce a nested collection of OS map tiles::

        files = [['OS 1:1,000,000', '/directory/to/1_to_1m'],
                 ['OS 1:250,000', '/directory/to/1_to_250k'],
                 ['OS 1:50,000', '/directory/to/1_to_50k'],
                 ]
        r = NestedImageCollection.from_configuration('os', ccrs.OSGB(), files)

    .. important::
        The pairs must be given in increasing resolution order, i.e. from
        low resolution to high resolution.

    Args:

    * name:
        The name for the resulting
        :class:`~cartopy.io.img_nest.NestedImageCollection` instance.

    * crs:
        The :class:`~cartopy.crs.Projection` of the image collection.

    * name_dir_pairs:
        A list of image collection name and directory path pairs.

    Kwargs:

    * glob_pattern:
        The image collection filename glob pattern, defaulting to '*.tif'.

    * img_class:
        The class of images created in the image collection.

    Returns:
        A :class:`~cartopy.io.img_nest.NestedImageCollection` instance.

    """
    scanned = []
    for pair_name, pair_dir in name_dir_pairs:
        # Each configured directory becomes one ImageCollection whose
        # contents are discovered by a glob scan.
        coll = ImageCollection(pair_name, crs)
        coll.scan_dir_for_imgs(pair_dir,
                               glob_pattern=glob_pattern,
                               img_class=img_class)
        scanned.append(coll)
    return cls(name, crs, scanned)
def from_configuration(cls, name, crs, name_dir_pairs, glob_pattern='*.tif', img_class=Img):
    """
    Build a :class:`~cartopy.io.img_nest.NestedImageCollection` from a
    list of image collection name and directory path pairs.

    For example, to produce a nested collection of OS map tiles::

        files = [['OS 1:1,000,000', '/directory/to/1_to_1m'],
                 ['OS 1:250,000', '/directory/to/1_to_250k'],
                 ['OS 1:50,000', '/directory/to/1_to_50k'],
                 ]
        r = NestedImageCollection.from_configuration('os', ccrs.OSGB(), files)

    Parameters
    ----------
    name
        The name for the resulting
        :class:`~cartopy.io.img_nest.NestedImageCollection` instance.
    crs
        The :class:`~cartopy.crs.Projection` of the image collection.
    name_dir_pairs
        A list of image collection name and directory path pairs.
    glob_pattern: optional
        The image collection filename glob pattern. Defaults to '*.tif'.
    img_class: optional
        The class of images created in the image collection.

    Returns
    -------
    A :class:`~cartopy.io.img_nest.NestedImageCollection` instance.

    Warnings
    --------
    The pairs must be given in increasing resolution order, i.e. from
    low resolution to high resolution.

    """
    collections = []
    for pair in name_dir_pairs:
        coll_name, coll_dir = pair
        # Build one ImageCollection per configured directory and scan it
        # for matching image files before assembling the nested object.
        coll = ImageCollection(coll_name, crs)
        coll.scan_dir_for_imgs(coll_dir,
                               glob_pattern=glob_pattern,
                               img_class=img_class)
        collections.append(coll)
    return cls(name, crs, collections)
def _sift_tasks(mapping):
    """Split *mapping* into sorted task names and sub-collection names.

    A name whose value passes ``_is_task`` is a task; otherwise, if the
    value is itself a mapping it is treated as a nested collection.
    """
    tasks = []
    collections = []
    for key, val in list(mapping.items()):
        if _is_task(key, val):
            tasks.append(key)
        elif isMappingType(val):
            collections.append(key)
    return sorted(tasks), sorted(collections)
def _sift_tasks(mapping):
    """Split *mapping* into sorted task names and sub-collection names.

    Iterates lazily via ``iteritems``; names whose values pass ``_is_task``
    are tasks, mapping-valued names are nested collections.
    """
    task_names = []
    collection_names = []
    for key, val in iteritems(mapping):
        if _is_task(key, val):
            task_names.append(key)
        elif isMappingType(val):
            collection_names.append(key)
    return sorted(task_names), sorted(collection_names)
def _make_skos_collections(self):
    """Render all SKOS Collections through the output-format template.

    Each entry is a (default prefLabel, fragment id, rendered collection)
    triple built from ``self.COLLECTIONS``.
    """
    rendered = [
        (meta["default_prefLabel"],
         meta["fid"],
         self._make_skos_collection((uri, meta)))
        for uri, meta in self.COLLECTIONS.items()
    ]
    template = self._load_template("collections." + self.outputformat)
    return template.render(collections=rendered)
def getCollectionList():
    """Return a JSON status document listing the known file collections."""
    global fileList
    global fileCollections
    if debug:
        syslog.syslog("getCollectionList")
    # Refresh the module-level collection mapping before listing it.
    getFileCollections()
    names = []
    for key in sorted(fileCollections.keys()):
        if debug:
            syslog.syslog("found collection:" + str(key))
        names.append(key)
    return json.dumps({'status': 'ok', 'collections': names})
def addmodule(request):
    """Show the add-module form; on a valid POST create the module and redirect.

    An invalid POST falls through and re-renders the bound form with its
    validation errors.
    """
    if request.method == "POST":
        form = forms.ModuleForm(request.POST)
        if form.is_valid():
            module = form.cleaned_data["module"]
            # Resolve every selected collection pk; 404 on a bad id.
            chosen = [
                get_object_or_404(models.Collection, pk=int(pk))
                for pk in form.cleaned_data['collections']
            ]
            jobs.create_module(module, chosen)
            return HttpResponseRedirect(reverse('docserver-manager'))
    else:
        form = forms.ModuleForm()
    return render(request, 'docserver/addmodule.html', {"form": form})
def setup():
    """Initialise module-level playback state from the configured collections.

    Populates the global ``collections`` list from the spec, pops the first
    entry as ``currentCollection``, and arms ``timeout`` for it.
    """
    global currentCollection
    global collections
    global timeout
    # NOTE(review): rootDir is declared global but never assigned here —
    # possibly a leftover; confirm against the rest of the module.
    global rootDir
    currentCollection = ""
    # Append every configured collection entry to the module-level list.
    for d in Specs().s['collections']:
        collections.append(d)
    if debug:
        print(collections)
    # Start with the first collection and remove it from the queue.
    currentCollection = collections.pop(0)
    if debug:
        print("currentCollection:", currentCollection['name'])
    # Deadline (epoch seconds) after which the current collection expires;
    # assumes each entry carries a 'time' duration in seconds — TODO confirm.
    timeout = time.time() + currentCollection['time']
def getCollectionList():
    """Return a JSON status document listing the known file collections.

    Lazily builds the file list on first call, then reports the sorted
    collection names as ``{"status": "ok", "collections": [...]}``.
    """
    global fileList
    global fileCollections
    if debug:
        syslog.syslog("getCollectionList")
    # Build the file list on first use. (The original computed len(fileList)
    # into an unused local before and after this call; dead code removed.)
    if len(fileList) == 0:
        createFileList()
    collections = []
    for k in sorted(fileCollections.keys()):
        if debug:
            syslog.syslog("found collection:" + str(k))
        collections.append(k)
    status = {'status': 'ok', 'collections': collections}
    return json.dumps(status)
def compact_range_dumps(li):
    """Represent a list of integers as a string of closed intervals.

    [1, 2, 3, 4, 6, 7] => '1-4,6-7'

    The input need not be sorted; duplicates start a new interval, matching
    the original behaviour (e.g. [1, 1] => '1-1,1-1').

    Fix: an empty input now returns '' (the original emitted 'None-None').
    """
    li = sorted(li)
    if not li:
        return ''
    spans = []
    low = high = li[0]
    for number in li[1:]:
        if number == high + 1:
            # Extend the current run.
            high = number
        else:
            # Close the current run and start a new one.
            spans.append('{}-{}'.format(low, high))
            low = high = number
    spans.append('{}-{}'.format(low, high))
    return ','.join(spans)
def query(query, uris=None, exact=False):
    """Build a search expression from a query mapping and optional URIs.

    Each query key is translated through _QUERYMAP into a term; URI paths
    are collected into a single ``collection:(...)`` term. Terms are joined
    with ' AND '. Raises ValueError for exact queries, unsupported keywords,
    or unsearchable URIs.
    """
    if exact:
        raise ValueError('Exact queries not supported')
    terms = []
    # NOTE: dict.iteritems() — this block is Python 2 code.
    for key, values in (query.iteritems() if query else []):
        try:
            term = _QUERYMAP[key](values)
        except KeyError:
            # Unknown keyword (no mapper registered for this key).
            raise ValueError('Keyword "%s" not supported' % key)
        else:
            terms.append(term)
    collections = []
    for uri in uris or []:
        parts = uritools.urisplit(uri)
        if parts.path:
            # The URI path names a collection to restrict the search to.
            collections.append(parts.path)
        elif not parts.query and not parts.fragment:
            pass  # root URI?
        else:
            raise ValueError('Cannot search "%s"' % uri)
    if collections:
        terms.append('collection:(%s)' % ' OR '.join(collections))
    return ' AND '.join(terms)
def query(query, uris=None, exact=False):
    """Build a search expression from a query mapping and optional URIs.

    Query keys are translated through _QUERYMAP; URI paths are folded into
    one ``collection:(...)`` term. All terms are joined with ' AND '.
    Raises ValueError for exact queries, unsupported keywords, or
    unsearchable URIs.
    """
    if exact:
        raise ValueError("Exact queries not supported")
    terms = []
    for key, values in query.items() if query else []:
        try:
            term = _QUERYMAP[key](values)
        except KeyError:
            raise ValueError('Keyword "%s" not supported' % key)
        else:
            terms.append(term)
    collections = []
    for uri in uris or []:
        parts = uritools.urisplit(uri)
        if parts.path:
            collections.append(parts.path)
        elif parts.query or parts.fragment:
            raise ValueError('Cannot search "%s"' % uri)
        # else: bare root URI — nothing to restrict on
    if collections:
        terms.append("collection:(%s)" % " OR ".join(collections))
    return " AND ".join(terms)
def DrawGLScene():
    """GLUT display callback: clears the viewport, optionally draws a grid,
    colours each pair of containing/contained line graphs, and swaps buffers.

    NOTE(review): reconstructed from whitespace-mangled source — the exact
    indentation of some inner statements is a best-effort reading; verify
    against the original file.
    """
    global lineList, showGrid
    showGrid = True
    # Orthographic projection matching the window, white background.
    glLoadIdentity()
    glOrtho(0, winWidth, winHeight, 0, 0.0, 100.0)
    glClearColor(1, 1, 1, 1)
    glClearDepth(1.0)
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
    glColor3f(0, 0, 0)
    glLineWidth(4)
    gridSize = msHandler.getGridSize()
    if showGrid:
        # Background grid. NOTE(review): both axes use winHeight, so the
        # grid only covers a square region — confirm whether winWidth was
        # intended for the horizontal extent.
        glBegin(GL_LINES)
        for i in range(0, winHeight, gridSize):
            glVertex2f(i, 0)
            glVertex2f(i, winHeight)
            glVertex2f(0, i)
            glVertex2f(winHeight, i)
        glEnd()
    lineLists = msHandler.getLineLists()
    k = 0  # NOTE(review): unused
    # print("STARTING----------------")
    # print(lineLists[0])
    # print(lineLists[1])
    # print(lineLists[2])
    all_graphs = []
    # Collect the connected components (via BFS) of every line list.
    for l in lineLists:
        Graph = MS_Graph(l)
        graph = Graph.getGraph()
        #graph = getGraph(lineList)
        graphs = Graph.BFS(graph)
        j = 0  # NOTE(review): unused
        for graph in graphs:
            all_graphs.append(graph)
    collections = []
    # Pair up graphs where one contains the other (either direction).
    for idx1, g1 in enumerate(all_graphs):
        # you have to use the vertices
        if idx1 == len(all_graphs) - 1:
            break
        for idx2, g2 in enumerate(all_graphs):
            if g1 == g2:
                # NOTE(review): 'pass' means self-pairs still fall through to
                # the containment test below — likely intended 'continue'.
                pass
            if Graph.containsGraph(g1, g2) or Graph.containsGraph(g2, g1):
                collections.append((g1, g2))
    print("\n\n\n", collections)
    for idx, collection in enumerate(collections):
        for graph in collection:
            vertices = np.array(list(graph))
            glBegin(GL_LINES)
            # One fixed colour per pair index; the palette repeats after 10.
            if idx == 0 or idx == 10:  # if it is 9*n, with n = 0, 1, 2, ..., n
                glColor3f(1.0, 0.0, 0.0)
            if idx == 1 or idx == 11:  # if it is 1 + 10*n
                glColor3f(0.0, 1.0, 0.0)
            if idx == 2 or idx == 12:
                glColor3f(0.0, 0.0, 1.0)
            if idx == 3 or idx == 13:
                glColor3f(1.0, 1.0, 0.0)
            if idx == 4 or idx == 14:
                glColor3f(1.0, 0.0, 1.0)
            if idx == 5 or idx == 15:
                glColor3f(0.0, 1.0, 1.0)
            if idx == 6 or idx == 16:
                glColor3f(0.0, 0.5, 0.0)
            if idx == 7 or idx == 17:
                glColor3f(1.0, 0.5, 0.0)
            if idx == 8 or idx == 18:
                glColor3f(0.5, 1.0, 0.0)
            if idx == 9 or idx == 19:
                glColor3f(0.5, 0.0, 1.0)
            for i in range(len(vertices) - 1):
                if i == 0:
                    # Presumably closes the polygon: segment from the last
                    # vertex back to the first — TODO confirm.
                    glVertex2f(graph[-1][0], graph[-1][1])
                    glVertex2f(graph[i][0], graph[i][1])
                glVertex2f(graph[i][0], graph[i][1])
                glVertex2f(graph[i + 1][0], graph[i + 1][1])
            glEnd()
    # Earlier networkx-based colouring implementation, kept commented out:
    #print("I am printing collections\n",collections)
    #tmp = [tuple(tuple(j) for j in i) for i in lineList]
    #graph = nx.Graph(tmp);
    #j = 0
    #for idx, graph in enumerate(nx.connected_components(graph)):
    #vertices = np.array(list(graph))
    #glBegin(GL_LINES)
    #if j == 0:
    #glColor3f(1.0,0.0,0.0)
    #if j == 1:
    #glColor3f(0.0,1.0,0.0)
    #if j == 2:
    #glColor3f(0.0,0.0,1.0)
    #if j == 3:
    #glColor3f(1.0,1.0,0.0)
    #if j == 4:
    #glColor3f(1.0,0.0,1.0)
    #if j == 5:
    #glColor3f(0.0,1.0,1.0)
    #if j == 6:
    #glColor3f(0.0,0.5,0.0)
    #if j == 7:
    #glColor3f(1.0,0.5,0.0)
    #if j == 8:
    #glColor3f(0.5,1.0,0.0)
    #j+=1
    #for i in range(len(vertices)-1):
    #glVertex2f(vertices[i][0],vertices[i][1])
    #glVertex2f(vertices[i+1][0],vertices[i+1][1])
    #glEnd()
    #
    # for i in range(len(lineList)-1):
    #     dline = lineList[i]
    #     glVertex2f(dline[0][0],dline[0][1])
    #     glVertex2f(dline[1][0],dline[1][1])
    # glEnd()
    glutSwapBuffers()
def parse_series(self, response): print "Series" self.instancialize_database() if not self.check_logged(response): return self.log_in(response) update_id = None try: #Check if there is a dummy, if there is update only. If there inst the id will be none update_id = self.dbase.get_spider_item_id(response.url, 'entity') except ValueError as e: print "Error on getting dummy id on Series", e.message except: print "Error on getting dummy on Series", sys.exc_info()[0] util.PrintException() #Get romanized title romanized_title = response.css('span.releasestitle.tabletitle::text').extract() #Get description description = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(2)::text').extract() #Get webnovel link webnovel_link = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(2) a::attr(href)').extract() #Get type type = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(5)::text').extract() #Get titles associated_name = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(11)::text').extract() #Get people author_url = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(17) a::attr(href)').extract() author_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(17) a u::text').extract() author_alias_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(17)::text').extract() artist_url = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(20) a::attr(href)').extract() artist_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(20) a u::text').extract() artist_alias_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(20)::text').extract() #Get company original_publisher_url = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(26) a::attr(href)').extract() original_publisher_alias 
= response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(26) a u::text').extract() original_publisher_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(26)::text').extract() serialized_publisher = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(29) a::attr(href)').extract() serialized_publisher_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(29) a u::text').extract() serialized_publisher_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(29)::text').extract() english_publisher_url = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(35) a::attr(href)').extract() english_publisher_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(35) a u::text').extract() english_publisher_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(35)::text').extract() #Get year year = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(23)::text').extract() #Get related items. 
related = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(8) a::attr(href)').extract() related_text = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(8) a::text').extract() related_type = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(8)::text').extract() #Get status status = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(20)::text').extract() #Get animé comparative anime_start_end = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(26)::text').extract() #Get releases releases = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(17) a[rel=nofollow]::attr(href)').extract() #Get image images = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div center img::attr(src)').extract() #Get partial categories categories = response.css('li.tag_normal a::text').extract() try: #format romanized title romanized_title = util.sanitize_title(romanized_title[0]) #format description (synopsis) description = util.sanitize_content(description) descriptions = [] if(description): new_description = {} new_description['language_id'] = self.dbase.language_en new_description['content'] = description descriptions.append(new_description) #format titles titles = [] language_titles = [] for name in associated_name: new_name = util.sanitize_title(name) if(new_name): language = langid.classify(new_name) language_titles.append(language[0]) language_id = self.dbase.get_language_id_from_code(language[0]) new_title = {} new_title['title'] = new_name new_title['language_id'] = language_id titles.append(new_title) webnovel = False country_id = None language_id = None #format partial categories if(categories): categories = " ".join(categories) #format type type = util.sanitize_content(type) if(type == None): entity_type_id = self.dbase.entity_type_manga elif(type == 'Manga'): entity_type_id = self.dbase.entity_type_manga 
country_id = self.dbase.country_jp language_id = self.dbase.language_ja elif(type == 'Manhaw'): entity_type_id = self.dbase.entity_type_manhaw country_id = self.dbase.country_kr language_id = self.dbase.language_ko elif(type == 'Manhua'): entity_type_id = self.dbase.entity_type_manhua country_id = self.dbase.country_cn language_id = self.dbase.language_zh elif(type == 'Novel'): new_status = " ".join(status) #if is there is Web Volumes or Web Chapters in status if "Web Novel" in categories or re.search("Web",new_status) != None: entity_type_id = self.dbase.entity_type_webnovel webnovel = True else: entity_type_id = self.dbase.entity_type_lightnovel else: #Add new type: entity_type_id = self.dbase.add_type(type, 'entity') if(entity_type_id == None): entity_type_id = self.dbase.entity_type_manga #format people #remove [ from name #get author. If author don't exists create dummy author peoples = [] author_alias_text = [x for x in author_alias_text if x != ']'] relation_type_id = self.dbase.people_relation_type_writer for index, url in enumerate(author_url): add_dummy = False if 'add_author' in url: #Add dummy author try: people_name = util.get_formatted_name(util.sanitize_content(author_alias_text[index])) except IndexError as e: people_name = None if people_name: add_dummy = True else: util.Log(response.url, "Error on getting people name to insert dummy.", False) else: where_values = [] where_values.append(url) where_values.append('people') #Get author id from link. 
people_id = self.dbase.get_var('spider_item', ['id'], "url = %s and table_name = %s", where_values) people_name = util.get_formatted_name(util.sanitize_content(author_alias[index])) if(people_id == None): if people_name: add_dummy = True else: util.Log(response.url, "Error on getting people name to insert dummy on line 295.", False) else: where_values = [] where_values.append(people_name['name']) where_values.append(people_name['lastname']) where_values.append(people_id) alias_used_id = self.dbase.get_var('people_alias', ['id'], "name = %s and lastname = %s and people_id = %s", where_values) if(alias_used_id == None): #Insert alias. alias_used_id = self.dbase.add_people_alias(people_name['name'], people_name['lastname'], people_id, self.dbase.alias_type_alias) if(add_dummy): people_country = self.dbase.country_jp people_id = self.dbase.create_people(people_name['name'], people_name['lastname'], people_country) where_values = [] where_values.append(people_name['name']) where_values.append(people_name['lastname']) where_values.append(people_id) alias_used_id = self.dbase.get_var('people_alias', ['id'], "name = %s and lastname = %s and people_id = %s", where_values) self.dbase.add_spider_item('people', people_id, url) #print "Added dummy people" if people_id and alias_used_id: new_people = {} new_people['id'] = people_id new_people['alias_used_id'] = alias_used_id new_people['relation_type_id'] = relation_type_id peoples.append(new_people) else: util.Log(response.url, "Error on getting people name author to insert.", False) artist_alias_text = [x for x in artist_alias_text if x != ']'] relation_type_id = self.dbase.people_relation_type_illustrator for index, url in enumerate(artist_url): add_dummy = False if 'add_author' in url: #Add dummy author try: people_name = util.get_formatted_name(util.sanitize_content(artist_alias_text[index])) except IndexError as e: people_name = None if people_name: add_dummy = True else: util.Log(response.url, "Error on getting people 
name to insert dummy.", False) else: #Get author id from link. people_id = self.dbase.get_spider_item_id(url, 'people') people_name = util.get_formatted_name(util.sanitize_content(artist_alias[index])) if(people_id == None): if people_name: add_dummy = True else: util.Log(response.url, "Error on getting people name to insert dummy on line 350.", False) else: where_values = [] where_values.append(people_name['name']) where_values.append(people_name['lastname']) where_values.append(people_id) alias_used_id = self.dbase.get_var('people_alias', ['id'], "name = %s and lastname = %s and people_id = %s", where_values) if(alias_used_id == None): #Insert alias. alias_used_id = self.dbase.add_people_alias(people_name['name'], people_name['lastname'], people_id, self.dbase.alias_type_alias) if(add_dummy): if(country_id != None): people_country = country_id else: people_country = self.dbase.country_jp people_id = self.dbase.create_people(people_name['name'], people_name['lastname'], people_country) where_values = [] where_values.append(people_name['name']) where_values.append(people_name['lastname']) where_values.append(people_id) alias_used_id = self.dbase.get_var('people_alias', ['id'], "name = %s and lastname = %s and people_id = %s", where_values) self.dbase.add_spider_item('people', people_id, url) print "Added dummy people" if people_id and alias_used_id: new_people = {} new_people['id'] = people_id new_people['alias_used_id'] = alias_used_id new_people['relation_type_id'] = relation_type_id peoples.append(new_people) else: util.Log(response.url, "Error on getting people name artist to insert.", False) #format company companies = [] company_publisher = False original_publisher_text = [x for x in original_publisher_text if x != ']'] company_function_type_id = self.dbase.company_function_type_publisher #print original_publisher_url, original_publisher_alias, original_publisher_text if not "N/A" in original_publisher_url: for index, url in enumerate(original_publisher_url): 
add_dummy = False if 'add_publisher' in url: #Add dummy company company_name = util.sanitize_content(original_publisher_text[index]) add_dummy = True else: #Get author id from link. company_id = self.dbase.get_spider_item_id(url, 'company') company_name = util.sanitize_content(original_publisher_alias[index]) if(company_id == None): add_dummy = True else: where_values = [] where_values.append(company_name) where_values.append(company_id) alias_used_id = self.dbase.get_var('company_alias', ['id'], "name = %s and company_id = %s", where_values) #print "Alias used id", alias_used_id #Get country_id from original publisher if(country_id == None): where_values = [] where_values.append(company_id) country_id = self.dbase.get_var('company', ['country_id'], "id = %s", where_values) if(country_id == None): country_id = self.dbase.country_jp #Get language_id from original publisher if(language_id == None): language_id = self.dbase.get_language_from_country_id(country_id, self.dbase.language_ja) if(alias_used_id == None): #Insert alias. 
language = langid.classify(company_name) code = [] code.append(language[0]) language_id = self.dbase.get_var('language', ['id'], "code = %s", code) #print "Name :", company_name alias_used_id = self.dbase.add_alias(company_name, company_id, language_id, 'company', self.dbase.alias_type_alias) #print "Company name: ", company_name if(add_dummy): if(country_id != None): country_origin_id = country_id else: country_origin_id = self.dbase.country_jp if not language_id: language_id = self.dbase.language_ja company_id = self.dbase.create_company(company_name, language_id, country_origin_id, None, None, None, None, None, [], [], [], [], [], [], [], []) where_values = [] where_values.append(company_name) where_values.append(company_id) alias_used_id = self.dbase.get_var('company_alias', ['id'], "name = %s and company_id = %s", where_values) self.dbase.add_spider_item('company', company_id, url) print "Added dummy company" new_company = {} new_company['id'] = company_id new_company['function_type_id'] = company_function_type_id companies.append(new_company) company_publisher = True magazines = [] #Get serialized maganize: for index, magazine in enumerate(serialized_publisher_alias): magazines.append(magazine + " " + serialized_publisher_text[index]) magazines = util.sanitize_content(magazines) company_function_type_id = self.dbase.company_function_type_translator #Get english company: if not "N/A" in english_publisher_text: country_origin_id = self.dbase.country_us language_release = self.dbase.language_en for index, url in enumerate(english_publisher_url): add_dummy = False if 'add_publisher' in url: #Add dummy company company_name = util.sanitize_content(english_publisher_text[index]) add_dummy = True else: #Get author id from link. 
company_id = self.dbase.get_spider_item_id(url, 'company') company_name = util.sanitize_content(english_publisher_alias[index]) if(company_id == None): add_dummy = True else: where_values = [] where_values.append(company_name) where_values.append(company_id) alias_used_id = self.dbase.get_var('company_alias', ['id'], "name = %s and company_id = %s", where_values) if(alias_used_id == None): #Insert alias. #print "Name :", company_name alias_used_id = self.dbase.add_alias(company_name, company_id, language_release, 'company', self.dbase.alias_type_alias) #print "Company name: ", company_name if(add_dummy): company_id = self.dbase.create_company(company_name, language_release, country_origin_id, None, None, None, None, None, [], [], [], [], [], [], [], []) where_values = [] where_values.append(company_name) where_values.append(company_id) alias_used_id = self.dbase.get_var('company_alias', ['id'], "name = %s and company_id = %s", where_values) self.dbase.add_spider_item('company', company_id, url) print "Added dummy company" new_company = {} new_company['id'] = company_id new_company['function_type_id'] = company_function_type_id companies.append(new_company) create_webnovel_also = False if(webnovel and company_publisher): create_webnovel_also = True entity_type_id = self.dbase.entity_type_lightnovel #format year year = util.sanitize_content(year[0]) language = None #format country. Get country from associated name, if not found country_id will be Japan. 
if not country_id: language_country = {'ja': self.dbase.country_jp, 'ko': self.dbase.country_kr, 'zh': self.dbase.country_cn} language_test = {'ja': 0, 'ko': 0, 'zh' : 0} for title in titles: if title['language_id'] == self.dbase.language_ja: language_test['ja'] += 1 elif title['language_id'] == self.dbase.language_ko: language_test['ko'] += 1 elif title['language_id'] == self.dbase.language_zh: language_test['zh'] += 1 if(language_test['ja'] == language_test['ko'] and language_test['ko'] == language_test['zh']): language = language_country['ja'] else: language, value = max(language_test.iteritems(), key=lambda x: x[1]) if language in ['ja', 'ko', 'zh']: country_id = language_country[language] else: country_id = self.dbase.country_jp if not language_id: if language in ['ja', 'ko', 'zh']: languages = {'ja': self.dbase.language_ja, 'ko': self.dbase.language_ko, 'zh': self.dbase.language_zh} language_id = languages[language] else: language_id = self.dbase.language_ja #format related relateds = [] if(related): lenght_related_text = len(related_text) for index, item in enumerate(related): #Save dummy if not on database, if in database get id. 
dummy_series_id = self.dbase.get_spider_item_id(item, 'entity') collection_series_id = None if dummy_series_id == None: if index < lenght_related_text: dummy_name = util.sanitize_title(related_text[index]) else: dummy_name = None #Create dummy dummy_series_id = self.dbase.create_entity(dummy_name, self.dbase.entity_type_manga, self.dbase.classification_type_12, language_id, country_id) self.dbase.add_spider_item('entity', dummy_series_id, item) else: #Get collection from database: where_values = [] where_values.append(collection_series_id) collection_series_id = self.dbase.get_var('entity', ['collection_id'], "id = %s", where_values) #print "Dummy" , dummy_series_id new_related_type = util.sanitize_content(related_type[index]) if new_related_type: new_related_type = new_related_type.replace('(', '') new_related_type = new_related_type.replace(')', '') related_type_id = self.dbase.add_type(new_related_type, 'based') else: related_type_id = self.dbase.based_type_sequel_spinoff new_related = {} new_related['id'] = dummy_series_id new_related['type_id'] = related_type_id new_related['type_name'] = new_related_type new_related['collection_id'] = collection_series_id relateds.append(new_related) #Format images. The correct would be the edition have image and not entity. But mangaupdate don't save any related editions. #image = image[0] formatted_image = [] for image in images: image_array = image.split('.') new_image = {} new_image['url'] = image new_image['extension'] = image_array.pop() new_image_name = image_array.pop() new_image_name = new_image_name.split('/') new_image['name'] = new_image_name.pop() formatted_image.append(new_image) #Format related Doujinshi category_adult = False related_doujin = False #check if is doujinshi on title if(re.search(self.pattern_doujin, romanized_title) != None): #if is doujinshi, create a relation of doujinshi type. 
related_doujin = True category_adult = True #Get original from first part of " dj - " original_name = re.sub(self.pattern_remove_doujin, '', romanized_title) original_name = util.sanitize_title(original_name) if(original_name): where_values = [] where_values.append(original_name) original_id = self.dbase.get_var('entity_alias', ['entity_id'], "name = %s", where_values) else: original_name = 'Unkown name (Cralwer)' original_id = None if not original_id: #create dummy: original_id = self.dbase.create_entity(original_name, self.dbase.entity_type_manga, self.dbase.classification_type_12, language_id, country_id) self.dbase.add_spider_item('entity', original_id, 'Unknown') #format collection collection_id = None collection_started = 'False' #Get a collection from a related item only if type is prequel, sequel or spin-off. if related_doujin: #Get collection from original_name. if don't exists create collection. collection_id = self.dbase.create_collection(original_name) else: #if there is related items if relateds: #TODO: #Get collection from related items (Get from database because some other spider could make other item related with this). #Check if related is sequel, doujinshi or based on. If is there is a collection with the name of this entity. Collection will be the first part of the name. #Check if related is prequel, if is the collection is the name of prequel if there inst a collection on the prequel. #Update name of collection if there is more than one prequel. check recursive prequel. #Check which item started the collection. #if none found create collection from most used name. #Get name to make a new collection name. #self.dbase.get_related_item(self, table, first_field, second_field, relation_type, type_id, entity_id, limit = None) for item in relateds: if item['collection_id']: collection_id = item['collection_id'] break if not collection_id: #Check if name is similar to another collection already registered. Only check if name is larger then 3 characters. 
#This method can have mismatch collection names and collections will need to be check after all items was crawled using get_related_item. if(len(romanized_title) > 3): series_name = [] series_name.append(romanized_title) collection_id = self.dbase.get_col('collection', 'id', "%s LIKE '%%' || name || '%%'", series_name) if not collection_id: #create new collection with the first name type, get firstname part using regex. original_name = re.sub(self.pattern_replace_name,'',romanized_title) if not original_name: original_name = romanized_title collection_id = self.dbase.create_collection(original_name) elif(isinstance(collection_id, collections.Iterable) and not isinstance(collection_id, types.StringTypes)): #return the element most appear on list collections = [] for new_id in collection_id: collections.append(new_id[0]) collection_id = util.most_common_oneliner(collections) # Change this to use a relation on database. #format status status = util.sanitize_content(status) #format animé comparative anime_start_end = util.sanitize_content(anime_start_end) #format classification_type_id if(categories): if re.search(ur'[Mm]ature', categories) != None or re.search(ur'[Aa]dults?', categories) != None or re.search(ur'[Hh]entais?', categories) != None or re.search(ur'[Dd]oujin([ -]?shi)?s?', categories) != None or re.search(ur'[Ss]einens?', categories) != None: category_adult = True if(not category_adult): classification_type_id = self.dbase.classification_type_12 else: classification_type_id = self.dbase.classification_type_18
def _extract_collections(self):
    """Extract standard SKOS Collection metadata from the graph.

    Populates ``self.COLLECTIONS`` (keyed by Collection URI as ``str``) with
    labels, definitions, scope notes, source, topConceptOf and member URIs
    read from ``self.G``.

    Raises:
        Exception: if a Collection has no ``skos:prefLabel`` (a label is
            required to build its fragment id via ``self._make_fid``).
    """
    collections = []
    # TODO: handle OrderedCollections
    for s in self.G.subjects(predicate=RDF.type, object=SKOS.Collection):
        collections.append(str(s))

    # keeping the OrderedDict ordered
    for c in sorted(collections):
        self.COLLECTIONS[c] = {}

    # fill in each Collection's details from the graph
    for c in self.COLLECTIONS.keys():
        s = URIRef(c)  # for use in Graph() loops
        self.COLLECTIONS[c]["fid"] = None
        self.COLLECTIONS[c]["default_prefLabel"] = None
        self.COLLECTIONS[c]["prefLabels"] = set()
        self.COLLECTIONS[c]["altLabels"] = set()
        self.COLLECTIONS[c]["definitions"] = set()
        self.COLLECTIONS[c]["scopeNotes"] = set()
        self.COLLECTIONS[c]["source"] = None
        self.COLLECTIONS[c]["members"] = set()
        # FIX: "topConceptOfs" is read in the predicate loop below but was
        # never initialised, so any Collection carrying skos:topConceptOf
        # raised KeyError. Initialise it like the other set-valued keys.
        self.COLLECTIONS[c]["topConceptOfs"] = set()

        for p, o in self.G.predicate_objects(subject=s):
            if p == SKOS.prefLabel:
                # stored as (label, language) pairs; TODO: add in language
                self.COLLECTIONS[c]["prefLabels"].add((str(o), o.language))
                if o.language == self.default_language:
                    self.COLLECTIONS[c]["default_prefLabel"] = str(o)
            elif p == SKOS.altLabel:
                self.COLLECTIONS[c]["altLabels"].add(str(o))  # TODO: add in language
            elif p == SKOS.definition:
                self.COLLECTIONS[c]["definitions"].add(str(o))  # TODO: add in language
            elif p == SKOS.scopeNote:
                self.COLLECTIONS[c]["scopeNotes"].add(str(o))  # TODO: add in language
            elif p == DCTERMS.source:
                self.COLLECTIONS[c]["source"] = str(o)
            elif p == SKOS.topConceptOf:
                self.COLLECTIONS[c]["topConceptOfs"].add(str(o))
            elif p == SKOS.member:
                self.COLLECTIONS[c]["members"].add(str(o))
                # TODO: handle members that are other Collections, not Concepts

        # listify the sets (downstream consumers expect lists)
        self.COLLECTIONS[c]["prefLabels"] = list(self.COLLECTIONS[c]["prefLabels"])
        self.COLLECTIONS[c]["altLabels"] = list(self.COLLECTIONS[c]["altLabels"])
        self.COLLECTIONS[c]["definitions"] = list(self.COLLECTIONS[c]["definitions"])
        self.COLLECTIONS[c]["scopeNotes"] = list(self.COLLECTIONS[c]["scopeNotes"])
        self.COLLECTIONS[c]["members"] = list(self.COLLECTIONS[c]["members"])
        self.COLLECTIONS[c]["topConceptOfs"] = list(self.COLLECTIONS[c]["topConceptOfs"])

        # make fid
        # TODO: update to use default language label, not [0]
        try:
            pl = self.COLLECTIONS[c]["prefLabels"][0][0]
            self.COLLECTIONS[c]["fid"] = self._make_fid(pl, c)
        except Exception as e:
            print(e)
            # chain the original error so the real cause isn't lost
            # (message typo fixed: "You" -> "Your")
            raise Exception(
                "Your Collection {} doesn't have a label but it needs one!".format(c)
            ) from e
def parse_goods(self, response, media = False, figure = False): self.instancialize_database() if not self.check_logged(response): return self.log_in(response.url) if media: print "Media" elif figure: print "Figure" else: print "Goods" update_id = None try: #Check if there is a dummy, if there is update only. If there inst the id will be none if media: #Get update id from entity or from soundtrack or album update_id = self.dbase.get_spider_item_id_from_url(response.url) else: update_id = self.dbase.get_spider_item_id(response.url, 'goods') except ValueError as e: if media: print "Error on getting dummy id on Media", e.message else: print "Error on getting dummy id on Goods", e.message except: if media: print "Error on getting dummy on Media", sys.exc_info()[0] else: print "Error on getting dummy on Goods", sys.exc_info()[0] util.PrintException() #Get title title = response.css('#wide > h1 span[itemprop="name"]::attr(title)').extract() #Get details list details_list = response.css('ul.sd:nth-child(1) li') #Get release list release_list = response.css('#ref-releases + ul li') #Get pictures page. Get url from ID. #image_link = response.css('.tab > li:nth-child(6) a::attr(href)').extract() main_picture = response.css('.db-picture img::attr(src)').extract() #Get tags tags = response.css('.tags a:not([title=Information]):not([title=Yes]):not([title=No])::attr(title)').extract()#Exclude No, yes, information #Get related items related_url = response.css('ul.item:nth-child(14) li a::attr(href)').extract() related_text = response.css('ul.item:nth-child(14) li a::text').extract() related_type = response.css('ul.item:nth-child(14) li em::text').extract()#Only use if related_type and url have the same amount. 
#Get observation observations = response.css('div.msg::text').extract() if not observations: observations = response.css('div.msg div::text').extract() try: #Format title title = util.sanitize_title(title) #Format details list price, id, scale_id, release_date, scale = None, None, None, None, None versions_id, categories_id, comments, companies, materials, artists, personas, entities = [], [], [], [], [], [], [], [] counterfeit, cast_off, r18, region_free = False, False, False, False weight, width, length, height = None, None, None, None for item in details_list: new_item = item.css('label::text').extract() new_item = util.sanitize_title(new_item) new_content = item.css('div') if new_item and new_content: #print new_item #Check ID if new_item == "ID": id = new_content.css('::text').extract() id = util.sanitize_title(id) if id: id = id.replace('#','') #Check price elif re.search(self.pattern_price, new_item) != None: price = new_content.css('::text').extract() price = util.sanitize_title(price) if price: price = price.replace(ur'¥','') #print price #Check category elif re.search(self.pattern_categories, new_item) != None: #Figure category: Prepained, Action/Dolls, Trading, Garage Kits, Model Kits, Acessories categories = new_content.css('a::text').extract() for category in categories: category_name = util.sanitize_title(category) if category_name: if media: #Save category, return category id. category_id = self.dbase.add_name_to_table(category_name.title(), 'category') categories_id.append(category_id) else: categories_id.append(category_name.title()) #Check classification elif re.search(self.pattern_classification, new_item) != None: classifications = new_content.css('span.trigger > a::text').extract() for classification in classifications: #save comment on database. 
new_comment = {} new_comment['title'] = 'Crawler classification' new_comment['content'] = util.sanitize_content(classification) comments.append(new_comment) #Company elif re.search(self.pattern_companies, new_item) != None: new_companies = new_content.css('span.trigger') for company in new_companies: new_aliases = [] company_name = company.css('a:nth-child(1)::text').extract() company_name = util.sanitize_title(company_name) if company_name: language_code = langid.classify(company_name) language_id = self.dbase.get_language_id_from_code(language_code[0]) if not language_id: language_id = self.dbase.language_ja new_name = {} new_name['name'] = company_name new_name['language_id'] = language_id new_aliases.append(new_name) company_original_name = company.css('a:nth-child(1)::attr(switch)').extract() company_original_name = util.sanitize_title(company_original_name) if company_original_name: language_code = langid.classify(company_original_name) language_id = self.dbase.get_language_id_from_code(language_code[0]) if not language_id: language_id = self.dbase.language_ja new_name = {} new_name['name'] = company_original_name new_name['language_id'] = language_id new_aliases.append(new_name) company_url = company.css('a:nth-child(1)::attr(href)').extract() company_url = company_url[0] company_type = company.css('small::text').extract() company_type = util.sanitize_title(company_type) if company_type: company_type = re.sub(self.pattern_asps, '', company_type) company_type_id = self.dbase.add_type(company_type, 'company_function') else: company_type_id = self.dbase.company_function_type_creator company_id = self.dbase.get_spider_item_id(company_url, 'company') if not company_id: #Get company from alias where_values = [] where = [] for alias in new_aliases: where_values.append(alias['name']) where.append("name = %s") where = " or ".join(where) company_id = self.dbase.get_var('company_alias',['company_id'], where, where_values) if not company_id: #Create dummy 
alternate_names = [] if len(new_aliases) > 1: alternate_names = new_aliases[1:] company_id = self.dbase.create_company(new_aliases[0]['name'], self.dbase.language_ja, self.dbase.country_jp, None, None, None, None, None, [], [], [], [], [], [], [], alternate_names) self.dbase.add_spider_item('company', company_id, company_url) else: #Add alias for alias in new_aliases: self.dbase.add_alias(alias['name'], company_id, alias['language_id'], 'company', self.dbase.alias_type_alias) new_company = {} new_company['id'] = company_id new_company['function_type_id'] = company_type_id companies.append(new_company) #Character elif re.search(self.pattern_character, new_item): anchor_texts = new_content.css('span.trigger > a::text').extract() anchor_urls = new_content.css('span.trigger > a::attr(href)').extract() anchor_switchs = new_content.css('span.trigger > a::attr(switch)').extract()#original name for index, anchor_text in enumerate(anchor_texts): aliases = [] anchor_text = util.sanitize_title(anchor_text) if anchor_text: aliases.append(anchor_text) anchor_alias = util.sanitize_title(anchor_switchs[index]) if anchor_alias: aliases.append(anchor_alias) #Get id from spider_item persona_id = self.dbase.get_spider_item_id(anchor_urls[index], 'persona') if aliases: persona = {} persona['alias'] = aliases persona['id'] = persona_id personas.append(persona) #Origin elif re.search(self.pattern_origin, new_item): anchor_texts = new_content.css('span.trigger > a::text').extract() anchor_urls = new_content.css('span.trigger > a::attr(href)').extract() anchor_switchs = new_content.css('span.trigger > a::attr(switch)').extract()#original name for index, anchor_text in enumerate(anchor_texts): aliases = [] #Get id from spider_item entity_id = self.dbase.get_spider_item_id_from_url(anchor_urls[index]) anchor_text = util.sanitize_title(anchor_text) if anchor_text: aliases.append(anchor_text) anchor_alias = util.sanitize_title(anchor_switchs[index]) if anchor_alias: 
aliases.append(anchor_alias) if aliases: entity = {} entity['alias'] = aliases entity['id'] = entity_id entities.append(entity) #Check Dimensions elif re.search(self.pattern_dimensions, new_item): dimensions = new_content.css('::text').extract() dimensions = util.sanitize_title(dimensions) if dimensions: new_dimensions = re.sub(self.pattern_inside_paren, '--', dimensions) dimen = new_dimensions.split('--') for d in dimen: multi = 1 if re.search(self.pattern_m, d) != None: multi = 1000 elif re.search(self.pattern_cm, d) != None: multi = 10 new_d = re.sub(self.pattern_dimension, '', d) if re.search(self.pattern_w, new_d) != None: width = re.sub(self.pattern_alpha,'', new_d) width = util.convert_to_number(width) * multi elif re.search(self.pattern_h, new_d) != None: height = re.sub(self.pattern_alpha,'', new_d) height = util.convert_to_number(height) * multi elif re.search(self.pattern_l, new_d) != None: length = re.sub(self.pattern_alpha,'', new_d) length = util.convert_to_number(length) * multi #Register dimensions as comment. 
new_comment = {} new_comment['title'] = 'Crawler item dimensions' new_comment['content'] = dimensions comments.append(new_comment) #check version elif re.search(self.pattern_version, new_item) != None: versions = new_content.css('span.trigger > a::text').extract() for version in versions: version_id = util.sanitize_title(version) if version_id: if re.search(ur'[Rr]18', version_id) != None: r18 = True version_id = re.sub(ur'[Vv]er.?$', '', version_id) version_id = self.dbase.add_name_to_table(version_id.title(), 'goods_version') versions_id.append(version_id) #Check release date elif re.search(self.pattern_release, new_item) != None: release_date = new_content.css('a::text').extract() if release_date: release_date = release_date[0] #Scale elif new_item == "Scale": scale = new_content.css('span.trigger > a::text').extract() scale = util.sanitize_title(scale) if scale: scale_id = self.dbase.add_name_to_table(scale, 'scale') #Materials elif new_item == "Material": material = new_content.css('span.trigger > a::text').extract() material = util.sanitize_title(material) if material: material_id = self.dbase.add_name_to_table(material, 'material') materials.append(material_id) #Artist (People) elif re.search(self.pattern_artist, new_item): new_artists = new_content.css('span.trigger') for artist in new_artists: new_aliases = [] artist_name = artist.css('a:nth-child(1)::text').extract() artist_name = util.sanitize_title(artist_name) if artist_name: new_name = util.get_formatted_name(artist_name) if new_name: new_aliases.append(new_name) artist_original_name = artist.css('a:nth-child(1)::attr(switch)').extract() artist_original_name = util.sanitize_title(artist_original_name) if artist_original_name: new_name = util.get_formatted_name(artist_original_name) if new_name: new_aliases.append(new_name) artist_url = artist.css('a:nth-child(1)::attr(href)').extract() artist_url = artist_url[0] artist_type = artist.css('small::text').extract() artist_type = 
util.sanitize_title(artist_type) if artist_type: artist_type = re.sub(self.pattern_asps, '', artist_type) artist_type_id = self.dbase.add_type(artist_type, 'create') else: artist_type_id = self.dbase.people_create_type_sculptor artist_id = self.dbase.get_spider_item_id(artist_url, 'people') if not artist_id: #Get artist from alias where_values = [] where = [] for alias in new_aliases: where_values.append(alias['name']) where_values.append(alias['lastname']) where.append("name = %s") where.append("lastname = %s") where = " or ".join(where) artist_id = self.dbase.get_var('people_alias',['people_id'], where, where_values) if not artist_id: #Create dummy alternate_names = [] if len(new_aliases) > 1: alternate_names = new_aliases[1:] artist_id = self.dbase.create_people(new_aliases[0]['name'], new_aliases[0]['lastname'], self.dbase.country_jp, None, None, None, None, None, None, None, alternate_names) self.dbase.add_spider_item('people', artist_id, artist_url) else: #Add alias for alias in new_aliases: self.dbase.add_people_alias(alias['name'], alias['lastname'], artist_id, self.dbase.alias_type_alias) where_values = [] where_values.append(artist_id) where_values.append(new_aliases[0]['name']) where_values.append(new_aliases[0]['lastname']) artist_alias_used = self.dbase.get_var('people_alias', ['id'], "people_id = %s and name = %s and lastname = %s", where_values) new_artist = {} new_artist['id'] = artist_id new_artist['alias_id'] = artist_alias_used new_artist['function_type_id'] = artist_type_id artists.append(new_artist) #Various. 
Types: 2140g, Counterfeit; 14 tracks, 1 disc, 01:17:00; 1 disc; Region-free; Cast off; elif new_item == "Various": content = "" text = new_content.css('span.trigger > a span::text').extract() text = util.sanitize_content(text) if text: content = text text = new_content.css('span::text').extract() text = util.sanitize_content(text) if text: content = content + " " + text text = new_content.css('a::text').extract() text = util.sanitize_content(text) if text: content = content + " " + text text = new_content.css('::text').extract() text = util.sanitize_content(text) if text: content = content + " " + text if content: if re.search(self.pattern_counterfeit, content) != None: counterfeit = True if re.search(self.pattern_cast_off, content) != None: cast_off = True if re.search(self.pattern_r18, content) != None: r18 = True #Check if is region free if re.search(self.pattern_free_region, content) != None: region_free = True new_comment = {} new_comment['title'] = 'Crawler new_item various' new_comment['content'] = content comments.append(new_comment) entities_origin_id = [] persona_origin_id = [] #Format character and origin for entity in entities: #Format first entities if entity['id']: entities_origin_id.append(entity['id']) else: #Check if there is a entity origin with the same name, else create dummy. where, where_values = [], [] for alias in entity['alias']: where.append("name = %s") where_values.append(alias) where = " or ".join(where) return_items = self.dbase.get_col('entity_alias','entity_id', where, where_values) if return_items: if len(return_items) > 1: for return_item in return_items: new_comment = {} new_comment['title'] = 'Cralwer Not Associated Entity' new_comment['content'] = return_item[0] comments.append(new_comment) else: entities_origin_id.append(return_items[0][0]) else: #Create dummy. 
entity_dummy_id = self.dbase.create_entity(entity['alias'][0], self.dbase.entity_type_anime, self.dbase.classification_type_12, self.dbase.language_ja, self.dbase.country_jp) entities_origin_id.append(entity_dummy_id) for persona in personas: if persona['id']: persona_origin_id.append(persona['id']) else: #Check if there is a entity origin with the same name, else create dummy. persona_name = util.get_formatted_name(persona['alias']) if persona_name: where_values = [] where_values.append(persona_name['name']) where_values.append(persona_name['lastname']) return_items = self.dbase.get_col('persona_alias','persona_id', "name = %s and last_name = %s", where_values) if not return_items: persona_name = util.get_formatted_name(persona['alias'], True) where_values = [] where_values.append(persona_name['name']) where_values.append(persona_name['lastname']) return_items = self.dbase.get_col('persona_alias','persona_id', "name = %s and last_name = %s", where_values) if return_items: if len(return_items) > 1: for return_item in return_items: new_comment = {} new_comment['title'] = 'Cralwer Not Associated Persona' new_comment['content'] = return_item[0] comments.append(new_comment) else: persona_origin_id.append(return_items[0][0]) else: persona_name = util.get_formatted_name(persona['alias']) #Create dummy. entity_dummy_id = self.dbase.create_persona(persona_name['name'], persona_name['lastname'], 'Undefined') persona_origin_id.append(entity_dummy_id) #If there is more than one dont associate. ''' #Format relationship between good and entity. if len() == 1 or len() == 1: #make a relation between character and entity. else: #Dont make relationship, add comment. 
new_comment = {} new_comment['title'] = 'Cralwer Could not associated Persona' new_comment['content'] = return_item comments.append(new_comment) #Make relationship between entities entity_length = len(entities) ''' if entities_origin_id: new_comment = {} new_comment['title'] = 'Cralwer Could not associated to entities' new_comment['content'] = util.sanitize_title(entities_origin_id) comments.append(new_comment) #Format picture link new_images = [] for image in main_picture: image = re.sub(ur'\?.*','', image) image = re.sub(ur'\bbig\b/','large/', image) image_array = image.split('.') new_image = {} new_image['url'] = image new_image['extension'] = image_array.pop() new_image_name = image_array.pop() new_image_name = new_image_name.split('/') new_image['name'] = new_image_name.pop() new_image['image_type_id'] = self.dbase.image_good_type_main new_images.append(new_image) #Format tags tags_id = [] for tag in tags: tag_id = util.get_formatted_tag(tag) if tag_id: tag_id = self.dbase.add_name_to_table(tag, 'tag') tags_id.append(tag_id) #Format release list #format release country and currency.release_date launch_countries = [] for release in release_list: date = release.css('div:nth-child(1) .time::text').extract() date = util.sanitize_title(date) date = util.get_formatted_date(date) launch_type = release.css('div:nth-child(2) em::text').extract() if launch_type: launch_type = util.sanitize_title(launch_type[0]) launch_type = self.dbase.add_name_to_table(launch_type, 'launch_type') if not launch_type: launch_type = self.dbase.add_name_to_table('Standard', 'launch_type') price = release.css('div:nth-child(4)::text').extract() price = util.sanitize_title(price) if price: price = price.replace(ur'¥', '') if not price: price = 0 new_launch = {} new_launch['country_id'] = self.dbase.country_jp new_launch['date'] = date new_launch['price'] = price new_launch['currency_id'] = self.dbase.currency_yen new_launch['launch_type_id'] = launch_type #Get event location = 
util.sanitize_title(release.css('div:nth-child(3) span::attr(title)').extract()) if location: new_comment = {} new_comment['title'] = 'Crawled location launch' new_comment['content'] = location + " type: " + str(launch_type) + ", " + date comments.append(new_comment) launch_countries.append(new_launch) #Format collection collection_id = None #Get collection from entities of origin. for origin in entities_origin_id: where_values = [] where_values.append(origin) collection_id = self.dbase.get_var('entity', ['collection_id'], "id = %s", where_values) if collection_id: break #Get collection same name as origin entities_origin_name = [] if not collection_id: for item in entities: for alias in item['alias']: new_alias = util.normalize_collection_name(alias) if new_alias: where_values = [] where_values.append(new_alias) collection_id = self.dbase.get_var('collection_alias', ['collection_id'], "name = %s", where_values) if not collection_id: entities_origin_name.append(new_alias) else: break if collection_id: break #Get collection name from similar collections, from the title: if not collection_id and len(title) > 3: #Check if name is similar to another collection already registered. Only check if name is larger then 3 characters. #This method can have mismatch collection names and collections will need to be check after all items was crawled using get_related_item. series_name = [] series_name.append(title) collection_id = self.dbase.get_col('collection_alias', 'collection_id', "%s LIKE '%%' || name || '%%'", series_name) if not collection_id: collection_id = self.dbase.get_col('collection_alias', 'collection_id', "name LIKE '%%' || %s || '%%'", series_name) if not collection_id: #create new collection with the first name type, get firstname part using regex. 
original_name = re.sub(self.pattern_replace_name,'',title) if original_name: collection_id = self.dbase.create_collection(original_name) if(isinstance(collection_id, collections.Iterable) and not isinstance(collection_id, types.StringTypes)): #return the element most appear on list collections = [] for new_id in collection_id: collections.append(new_id[0]) collection_id = util.most_common_oneliner(collections) #Else create collection from origin name. If there is no origin name get collection from classification. if not collection_id: if entities_origin_name: collection_id = self.dbase.create_collection(entities_origin_name[0]) #Format classification if cast_off or r18: classification_type_id = self.dbase.classification_type_18 else: classification_type_id = self.dbase.classification_type_16 #Format observation observation = util.sanitize_content(observations) if not media: #Format good type if categories_id: goods_type_id = self.dbase.add_type(categories_id[0], 'goods') else: goods_type_id = self.dbase.add_type('Unkown', 'goods') #Format figure itens if figure: #Format scale if not scale: scale_id = self.dbase.scale_non_scale else: scale_id = self.dbase.add_name_to_table(scale, 'scale') #Format related items relations = [] for index, type in enumerate(related_type): #Get id, else create dummy. 
if media: table = 'entity' else: table = 'goods' relation_id = self.dbase.get_spider_item_id(related_url[index], table) if not relation_id: #Create dummy if media: relation_id = self.dbase.create_entity(None, entity_type_id, classification_type_id, self.dbase.language_ja, self.dbase.country_jp) else: relation_id = self.dbase.create_goods(None, self.dbase.language_ja, goods_type_id, None, collection_id) self.dbase.add_spider_item(table, relation_id, related_url[index]) type = util.sanitize_title(type) type_id = self.dbase.add_type(type.title(),'associated') new_relation = {} new_relation['id'] = relation_id new_relation['type_id'] = type_id relations.append(new_relation) #Format Draft. if "This entry is a draft" in response.body: draft = True else: draft = False
def parse_series(self, response, franchise=False): self.instancialize_database() if franchise: print "Franchise" else: print "Series" update_id = None try: #Check if there is a dummy, if there is update only. If there inst the id will be none if franchise: update_id = self.dbase.get_spider_item_id( response.url, 'collection') else: update_id = self.dbase.get_spider_item_id( response.url, 'entity') except ValueError as e: if franchise: print "Error on getting dummy id on Franchise", e.message else: print "Error on getting dummy id on Series", e.message except: if franchise: print "Error on getting dummy on Franchise", sys.exc_info()[0] else: print "Error on getting dummy on Series", sys.exc_info()[0] util.PrintException() #Get search title/collection name series_title = response.css( 'div.middleframe:nth-child(3) > h1:nth-child(1)::text').extract() #Get series ID #Get details table. table_details = response.css('#bt tr') #Get english title english_title = table_details[1].css('td:nth-child(2)::text').extract() #Get romaji title romanji_title = table_details[2].css('td:nth-child(2)::text').extract() #Get furigana title furigana_title = table_details[3].css( 'td:nth-child(2)::text').extract() #Get japanese title japanese_title = table_details[4].css( 'td:nth-child(2)::text').extract() #Get synopse/Description synopsis = response.css( 'div.middleframe:nth-child(3) > div:nth-child(2) #besttable') synopsis = synopsis[2].css('::text').extract() #Get type type = response.css( 'div.middleframe:nth-child(3) > div:nth-child(2) > h3:nth-child(6)::text' ).extract() if not type: type = response.css( 'div.middleframe:nth-child(3) > div:nth-child(2) > h3:nth-child(7)::text' ).extract() if not type: util.Log(response.url, "Verificar type dessa entitdade.", False) type = response.css( 'div.middleframe:nth-child(3) > div:nth-child(2) > h3:nth-child(8)::text' ).extract() if not franchise: #Get related work related_url = response.css( '#tile > ul:nth-child(1) li 
a::attr(href)').extract() #Collection name notice = response.css('.notice_inner::text').extract() notice_name = response.css( '.notice_inner > a:nth-child(2)::text').extract() notice_url = response.css( '.notice_inner > a:nth-child(2)::attr(href)').extract() #Get images images = response.css( 'div.middleframe:nth-child(6) > div:nth-child(2) img::attr(src)' ).extract() images_url = response.css( 'div.middleframe:nth-child(6) > div:nth-child(2) a::attr(href)' ).extract() front_image = response.css( '.vector table tr > td:nth-child(4) > a:nth-child(1) > img:nth-child(1)::attr(src)' ).extract() else: #Get associated entities entities_url = response.css( 'div.middleframe:nth-child(3) > div:nth-child(2) > ul:nth-child(14) li a::text' ).extract() entities_text = response.css( 'div.middleframe:nth-child(3) > div:nth-child(2) > ul:nth-child(14) li a::text' ).extract() try: series_title = util.sanitize_title(series_title) series_title = re.sub(self.pattern_replace_name, '', series_title) first = True aliases = [] new_search = series_title.split('/') for part in new_search: if first: series_title = util.sanitize_title(part) first = False else: if part: new_title = {} new_title['title'] = util.sanitize_title(part) new_title['language_id'] = self.dbase.language_en aliases.append(new_title) #Format type. 
get from series_title, types avaliable: Franchise, Light Novel, Manga, Anime, Visual Novel, H-Game, OVA, ONA - Original Net Animation, Video Game, Movie, Drama CD type = util.sanitize_title(type) type_name = re.sub(self.pattern_parenthisis_right, '', type) type_name = re.sub(self.pattern_parenthisis_left, '', type_name) if "H-Game" in type_name: type_id = self.dbase.entity_type_erogame elif "OVA" in type_name: type_id = self.dbase.entity_type_ova elif "Movie" in type_name: type_id = self.dbase.entity_type_anime_movie else: type_id = self.dbase.add_type(type_name, 'entity') if update_id == None: #if is manga, light novel, or book check if there is if type_id == self.dbase.entity_type_manga or type_id == self.dbase.entity_type_lightnovel or type_id == self.dbase.entity_type_manhaw or type_id == self.dbase.entity_type_manhua: new_name = [] if type_id == self.dbase.entity_type_lightnovel or type_id == self.dbase.entity_type_webnovel: new_search_title = series_title + " (Novel)" new_name.append(new_search_title) update_id = self.dbase.get_var('entity_alias', ['entity_id'], "name = %s", new_name) #Format alias. 
Separe alias by / english_title = util.sanitize_title(english_title) if english_title: new_nme = english_title.split('/') for new in new_nme: new = util.sanitize_title(new) if new: new_title = {} new_title['title'] = new new_title['language_id'] = self.dbase.language_en aliases.append(new_title) romanji_title = util.sanitize_title(romanji_title) if romanji_title: new_nme = romanji_title.split('/') for new in new_nme: new = util.sanitize_title(new) if new: new_title = {} new_title['title'] = new new_title['language_id'] = self.dbase.language_ja aliases.append(new_title) furigana_title = util.sanitize_title(furigana_title) if furigana_title: new_nme = furigana_title.split('/') for new in new_nme: new = util.sanitize_title(new) if new: new_title = {} new_title['title'] = new new_title['language_id'] = self.dbase.language_ja aliases.append(new_title) japanese_title = util.sanitize_title(japanese_title) if japanese_title: new_nme = japanese_title.split('/') for new in new_nme: new = util.sanitize_title(new) if new: new_title = {} new_title['title'] = new new_title['language_id'] = self.dbase.language_ja aliases.append(new_title) comments = [] #Format table details #episodes = 0 #ova_episodes = 0 release_date, origin_entity_id, origin_type_id, origin_type = None, None, None, None genres_id = [] aliases_company, companies, peoples, wikies = [], [], [], [] found_origin = False classification_type_id = self.dbase.classification_type_12 for item in table_details[5:]: new_item = util.sanitize_title(item.css('th::text').extract()) if not new_item: new_item = util.sanitize_title( item.css('th a::text').extract()) new_content_url_text = item.css('td a::text').extract() new_content_url = item.css('td a::attr(href)').extract() new_content_text = item.css('td::text').extract() if new_item and (new_content_url_text or new_content_text): #print new_item #Check release date if new_item == "Release Date": release_date = util.sanitize_title( new_content_url_text) if not release_date: 
release_date = util.sanitize_title( new_content_text) #Check studios elif "Studio Name" in new_item: if new_item == "English Studio Name" or new_item == "Japanese Studio Name": if new_item == "Japanese Studio Name": language_company = self.dbase.language_ja else: language_company = self.dbase.language_en for index, url in enumerate(new_content_url): company_name = util.sanitize_title( new_content_url_text[index]) if company_name: new_alias = {} new_alias['url'] = re.sub( self.pattern_language, '', self.get_formatted_link(url)) new_alias['name'] = company_name new_alias['language_id'] = language_company aliases_company.append(new_alias) #Check publisher and developer. elif re.search(self.pattern_companies, new_item) != None: #Get company id from alias. company_name = util.sanitize_title( new_content_url_text) if not company_name: company_name = util.sanitize_title( new_content_text) if company_name: #Get function type: function_type = self.dbase.add_type( new_item.title(), 'company_function') new_company_name = company_name.split(',') for company_name in new_company_name: company_id = None if company_name: where_values = [] where_values.append(company_name) company_id = self.dbase.get_var( 'company_alias', ['company_id'], "name = %s", where_values) #Get relation_type if not company_id: #create dummy. 
company_id = self.dbase.create_company( company_name, self.dbase.language_ja, self.dbase.country_jp) company = {} company['id'] = company_id company['function_type_id'] = function_type if not franchise: companies.append(company) else: companies.append(company_id) #Check ratings elif new_item == "Content Rating": #Format rating ratings = util.sanitize_title(new_content_text) if not ratings: ratings = util.sanitize_title(new_content_url_text) if ratings: if "Mature" in ratings: classification_type_id = self.dbase.classification_type_17 elif "Everyone" in ratings: classification_type_id = self.dbase.classification_type_free elif "Child" in ratings: classification_type_id = self.dbase.classification_type_3 elif "10+" in ratings: classification_type_id = self.dbase.classification_type_10 elif "Teen" in ratings: classification_type_id = self.dbase.classification_type_13 elif "Adult" in ratings: classification_type_id = self.dbase.classification_type_18 #Check genre elif new_item == "Genre Tags": #Format genre genres = new_content_url_text for genre in genres: new_genre = util.sanitize_title(genre) if new_genre: #Create genre genre_id = self.dbase.add_name_to_table( new_genre.title(), 'genre') #if genre_id: Dont need to check, if not save will raise a valueerror. 
new_genre = {} new_genre['id'] = genre_id genres_id.append(new_genre) #Check links elif new_item == "Links": #Format wikis for index, link in enumerate(new_content_url): if re.search(self.pattern_jp, link) != None: link_language = self.dbase.language_ja elif re.search(self.pattern_pt, link) != None: link_language = self.dbase.language_pt else: link_language = self.dbase.language_en wiki = {} wiki['name'] = new_content_url_text[index] wiki['url'] = link wiki['language_id'] = link_language wikies.append(wiki) #Check episodes number (Epidoses, OVA) elif re.search(self.pattern_episodes, new_item) != None: #check if OVA #if re.search(ur'\b[OovVAa]{3}\b', new_item) != None: #else: content = util.sanitize_title(new_content_url_text) if not content: content = util.sanitize_title(new_content_text) if content: comment = {} comment['title'] = 'Cralwer Episodes Number' comment['content'] = content comments.append(comment) #Check origin elif re.search(self.pattern_origin, new_item) != None: #If Origin in manga origin_type = util.sanitize_title(new_content_text) if not origin_type: origin_type = util.sanitize_title( new_content_url_text) if origin_type: where_values = [] where_values.append(origin_type.title()) origin_type_id = self.dbase.get_var( 'entity_type', ['id'], "name = %s", where_values) if origin_type_id: found_origin = True where_values = [] where_values.append(origin_type_id) where_values.append(series_title) where_values.append(series_title) origin_entity_id = self.dbase.get_var( 'entity', ['entity.id'], "entity.entity_type_id = %s and (%s like '%%' || entity_alias.name || '%%' or entity_alias.name = %s)", where_values, ['entity_alias'], ["entity_alias.entity_id = entity.id"]) if not found_origin: util.Log(response.url, "not found origin type", False) #Check people (Director, Author, Artist, Writer, Composer, ADR Director, Character Design, Illustrator, Scenario elif re.search(self.pattern_people, new_item) != None: #Get people id from alias. 
people_name = util.sanitize_title(new_content_url_text) if not people_name: people_name = util.sanitize_title(new_content_text) if people_name: new_people_name = people_name.split(',') for people_name in new_people_name: people_id, alias_used_id = None, None people_name = util.get_formatted_name( people_name, True) if people_name: #Get relation type relation_type_id = self.dbase.add_type( new_item.title(), 'produces') where_values = [] where_values.append(people_name['name']) where_values.append( people_name['lastname']) alias_used_id = self.dbase.get_var( 'people_alias', ['id'], "name = %s and lastname = %s", where_values) #Get relation_type if not alias_used_id: #create dummy. people_id = self.dbase.create_people( people_name['name'], people_name['lastname'], self.dbase.country_jp) #Get alias. where_values = [] where_values.append(people_id) alias_used_id = self.dbase.get_var( 'people_alias', ['id'], "people_id = %s", where_values) else: #get people_id where_values = [] where_values.append(alias_used_id) people_id = self.dbase.get_var( 'people_alias', ['people_id'], "id = %s", where_values) people = {} people['id'] = people_id people['alias_used_id'] = alias_used_id people[ 'relation_type_id'] = relation_type_id peoples.append( people ) #There is no plural for multiple individual but, I don`t care. #Check twitter elif re.search(self.pattern_twitter, new_item) != None: #Add twitter as comment. 
comment = {} comment['title'] = 'Cralwer Twitter new_item' comment['content'] = new_content_url comments.append(comment) else: content = util.sanitize_content(new_content_url_text) if not content: content = util.sanitize_content(new_content_text) else: second_content = util.sanitize_content( new_content_text) if second_content: content = content + '\n' + second_content if content: #save comment comment = {} comment[ 'title'] = 'Cralwer unknown new_item' + new_item comment['content'] = content comments.append(comment) #Format companies creator #Get company id from spider_item or alias for item in aliases_company: company_update_id = self.dbase.get_spider_item_id( self.get_formatted_link(item['url']), 'company') if not company_update_id: where_values = [] where_values.append(item['name']) company_update_id = self.dbase.get_var( 'company_alias', ['company_id'], "name = %s", where_values) alternate_names = [] if not company_update_id: company_current_alias = item['name'] else: company_current_alias = None new_alias = {} new_alias['name'] = item['name'] new_alias['language_id'] = item['language_id'] alternate_names.append(new_alias) #Create dummy. This method return the company ID and add new aliases if the company already exists. The new alias will be named as romanized type. company_id = self.dbase.create_company( company_current_alias, item['language_id'], self.dbase.country_jp, None, None, None, None, None, [], [], [], [], [], [], [], alternate_names, company_update_id) #Don't need to check if company_id is True because a error is raise if not True. new_company = {} new_company['id'] = company_id new_company[ 'function_type_id'] = self.dbase.company_function_type_creator if not franchise: companies.append(new_company) else: companies.append(company_id) self.dbase.add_spider_item('company', company_id, item['url'], False) if not franchise: #Format related work relateds = [] for url in related_url: #Check if already registered, else create dummy without name. 
new_url = self.get_formatted_link(url) related_id = self.dbase.get_spider_item_id( new_url, 'entity') if not related_id: #create dummy. related_id = self.dbase.create_entity( None, type_id, self.dbase.classification_type_12, self.dbase.language_ja, self.dbase.country_jp) self.dbase.add_spider_item('entity', related_id, new_url) new_related = {} new_related['id'] = related_id new_related['type_id'] = self.dbase.based_type_sequel relateds.append(new_related) #Format images new_images = [] for image in images: image_array = self.get_formatted_link(image).split('.') new_image = {} new_image['url'] = self.get_formatted_link(image) new_image['extension'] = image_array.pop() new_image_name = image_array.pop() new_image_name = new_image_name.split('/') new_image['name'] = new_image_name.pop() new_images.append(new_image) for image in front_image: image_array = self.get_formatted_link(image).split('.') new_image = {} new_image['url'] = self.get_formatted_link(image) new_image['extension'] = image_array.pop() new_image_name = image_array.pop() new_image_name = new_image_name.split('/') new_image['name'] = new_image_name.pop() new_images.append(new_image) #Format collection collection_id = None collection_started = 'False' if "extends" in notice: #Get collection from spider item if notice_url: new_url_collection = self.get_formatted_link( notice_url[0]) collection_id = self.dbase.get_spider_item_id( new_url_collection, 'new_url_collection') if not collection_id and notice_name: notice_name = util.sanitize_title(notice_name) notice_name = re.sub(self.pattern_replace_name, '', notice_name) where_values = [] where_values.append(notice_name) collection_id = self.dbase.get_col( 'collection', 'id', "%s LIKE '%%' || name || '%%'", where_values) if not collection: #create collection collection_name = util.normalize_collection_name( util.normalize_collection_name(notice_name)) collection_id = self.dbase.create_collection( collection_name) if new_url_collection: 
add_spider_item('collection', collection_id, new_url_collection) elif (isinstance(collection_id, collections.Iterable) and not isinstance(collection_id, types.StringTypes)): #return the element most appear on list collections = [] for new_id in collection_id: collections.append(new_id[0]) collection_id = util.most_common_oneliner( collections) if not collection_id: #Check if name is similar to another collection already registered. Only check if name is larger then 3 characters. #This method can have mismatch collection names and collections will need to be check after all items was crawled using get_related_item. if (len(series_title) > 3): series_name = [] series_name.append(series_title) collection_id = self.dbase.get_col( 'collection', 'id', "%s LIKE '%%' || name || '%%'", series_name) if not collection_id: #create new collection with the first name type, get firstname part using regex. original_name = re.sub(self.pattern_replace_name, '', series_title) if not original_name: original_name = series_title original_name = util.normalize_collection_name( original_name) collection_id = self.dbase.create_collection( original_name) elif (isinstance(collection_id, collections.Iterable) and not isinstance(collection_id, types.StringTypes)): #return the element most appear on list collections = [] for new_id in collection_id: collections.append(new_id[0]) collection_id = util.most_common_oneliner( collections) #Format language and country if type_id == self.dbase.entity_type_manhaw: language_id = self.dbase.language_ko country_id = self.dbase.country_kr elif type_id == self.dbase.entity_type_manhua: language_id = self.dbase.language_zh country_id = self.dbase.country_cn else: #Format language language_id = self.dbase.language_ja #Format country country_id = self.dbase.country_jp #Format classification if type_id == self.dbase.entity_type_erogame and classification_type_id != self.dbase.classification_type_18: classification_type_id = self.dbase.classification_type_18 
#Format origin if not origin_entity_id and found_origin: #Create dummy origin. if aliases: new_title = aliases[0]['title'] origin_entity_id = self.dbase.create_entity( new_title, origin_type_id, classification_type_id, language_id, country_id) #Format synopsis synopses = [] if synopsis: synopis_content = util.sanitize_content(synopsis) if synopis_content: synops = {} synops['language_id'] = self.dbase.language_en synops['content'] = synopis_content synopses.append(synops) else: #Format name franchise_name = series_title.replace('(Franchise)', '') franchise_name = re.sub(self.pattern_replace_name, '', franchise_name) #Format description description = None if synopsis: description = util.sanitize_content(synopsis) entities = [] #Format associated entities for index, entity in enumerate(entities_text): entity_name = util.sanitize_title(entities_text) if entity_name: #Get id from spider_item entity_id = self.dbase.get_spider_item_id( self.get_formatted_link(entities_url[index]), 'entity') if not entity_id: #Get id from alias. where_values = [] where_values.append(entity_name) entity_id = self.dbase.get_var( 'entity_alias', ['entity_id'], "name = %s", where_values) if not entity_id: #Create dummy. 
entity_id = self.dbase.create_entity( entity_name, type_id, self.dbase.classification_type_12, self.dbase.language_ja, self.dbase.country_jp) self.dbase.add_spider_item('entity', entity_id, entities_url[index]) entities.append(entity_id) except ValueError as e: if not franchise: print "Error on formatting and getting IDs to save Series", e.message else: print "Error on formatting and getting IDs to save Franchise", e.message util.PrintException() util.Log(response.url, e.message) return except: if not franchise: print "Error on formatting Series", sys.exc_info()[0] else: print "Error on formatting Franchise", sys.exc_info()[0] util.PrintException() util.Log(response.url, sys.exc_info()[0]) return try: self.dbase.set_auto_transaction(False) if franchise: collection_id = self.dbase.create_collection( franchise_name, description, [], companies, aliases, self.dbase.language_ja, update_id, entities) self.dbase.add_spider_item('collection', collection_id, response.url, True) else: entity_id = self.dbase.create_entity( series_title, type_id, classification_type_id, language_id, country_id, release_date, collection_id, collection_started, aliases, [], synopses, wikies, [], [], [], genres_id, [], companies, [], relateds, None, new_images, update_id) if origin_entity_id: self.dbase.add_relation_with_type( 'entity', 'entity', origin_entity_id, entity_id, 'based', self.dbase.based_type_adapted_from) for comment in comments: self.dbase.add_comment(comment['title'], comment['content'], 1, entity_id, 'entity') self.dbase.add_spider_item('entity', entity_id, response.url, True) self.dbase.commit() print "Success" except ValueError as e: self.dbase.rollback() print "Error on save Series", e.message util.PrintException() util.Log(response.url, e.message) except: self.dbase.rollback() print "Error on save Series", sys.exc_info()[0] util.PrintException() util.Log(response.url, sys.exc_info()[0]) finally: self.dbase.set_auto_transaction(True)