def copy_variable_ref_to_graph(input_graph,
                               output_graph,
                               var_ref,
                               init_value,
                               scope=''):
    """Create a copy of ``var_ref`` inside ``output_graph``.

    The new variable is named after ``var_ref`` (everything before the first
    ':' in its name), prefixed with ``scope + '/'`` when ``scope`` is not
    empty. It is registered in the same collections of ``input_graph`` that
    contain ``var_ref``; collection names other than GLOBAL_VARIABLES and
    TRAINABLE_VARIABLES are also prefixed with the scope.

    Args:
        input_graph: graph whose collections are searched for ``var_ref``.
        output_graph: graph in which the new variable is created.
        var_ref: the variable to copy.
        init_value: initial value of the new variable; its ``shape`` is
            applied to the new variable.
        scope: optional name prefix, '' for none.

    Returns:
        The newly created variable.
    """
    scoped = scope != ''

    # Drop the ':<suffix>' part of the original name.
    base_name = var_ref.name[:var_ref.name.index(':')]
    new_name = scope + '/' + base_name if scoped else base_name

    # These two collection keys are never prefixed with the scope.
    unscoped_keys = (ops.GraphKeys.GLOBAL_VARIABLES,
                     ops.GraphKeys.TRAINABLE_VARIABLES)
    collections = [
        key if (key in unscoped_keys or not scoped) else scope + '/' + key
        for key, members in input_graph._collections.items()
        if var_ref in members
    ]

    trainable_vars = input_graph.get_collection(
        ops.GraphKeys.TRAINABLE_VARIABLES)
    trainable = var_ref in trainable_vars

    with output_graph.as_default():
        new_var = Variable(init_value,
                           trainable,
                           name=new_name,
                           collections=collections,
                           validate_shape=False)
        # validate_shape=False leaves the shape unset, so apply it explicitly.
        new_var.set_shape(init_value.shape)
    return new_var
Esempio n. 2
0
    def set_predefined_obj(self, key, obj):
        """Register ``obj`` under ``key`` in every collection of the chain.

        The chain runs from ``self.collection`` up through each ``parent``
        and is processed from the outermost ancestor down to
        ``self.collection``. For each collection a fresh ``Attribute`` is
        created, revised with ``obj`` and stored in ``collection.attributes``.
        Unless the value of ``obj`` is an ``Instance``, ``FuncValue`` or
        ``ModuleValue``, the attribute is also recorded in
        ``collection.inputs`` as an ``(obj, value, value, value)`` tuple.

        Args:
            key: attribute name.
            obj: object to bind; must provide ``get_value()``.
        """
        # Collect the chain of collections, innermost first.
        chain = []
        node = self.collection
        while True:
            chain.append(node)
            node = node.parent
            if node is None:
                break

        # Process the outermost collection first.
        chain.reverse()

        for collection in chain:
            attribute = Attribute(key)
            attribute.parent = self
            attribute.revise(obj)
            collection.attributes[key] = attribute

            # Instances, functions and modules are not tracked as inputs.
            if isinstance(obj.get_value(),
                          (Instance, FuncValue, ModuleValue)):
                continue

            collection.inputs[attribute] = (attribute.get_obj(),
                                            attribute.get_obj().get_value(),
                                            attribute.get_obj().get_value(),
                                            attribute.get_obj().get_value())
Esempio n. 3
0
    def from_configuration(
        cls,
        name,
        crs,
        name_dir_pairs,
        glob_pattern='*.tif',
        img_collection_cls=ImageCollection,
    ):
        """
        Build an instance of ``cls`` from [collection name, directory] pairs.

        One ``img_collection_cls`` object is created per pair (using ``crs``)
        and filled by scanning its directory for files matching
        ``glob_pattern``. The collections are passed to ``cls`` in the order
        given.

        For example, to produce a nested collection of OS map tiles::

            r = NestedImageCollection.from_configuration('os',
                                                 ccrs.OSGB(),
                                                 [['OS 1:1,000,000', '/directory/to/1_to_1m'],
                                                  ['OS 1:250,000', '/directory/to/1_to_250k'],
                                                  ['OS 1:50,000', '/directory/to/1_to_50k'],
                                                  ],
                                                 )

        """
        def _scanned(label, directory):
            # Create one collection and populate it from its directory.
            scanned = img_collection_cls(label, crs)
            scanned.scan_dir_for_imgs(directory, glob_pattern=glob_pattern)
            return scanned

        return cls(
            name,
            crs,
            [_scanned(label, directory) for label, directory in name_dir_pairs],
        )
Esempio n. 4
0
    def from_configuration(cls, name, crs, name_dir_pairs,
                           glob_pattern='*.tif',
                           img_collection_cls=ImageCollection):
        """
        Build an instance of ``cls`` from [collection name, directory] pairs.

        Each pair yields one ``img_collection_cls`` object (created with
        ``crs``) whose directory is scanned for files matching
        ``glob_pattern``. Collections keep the order of ``name_dir_pairs``.

        For example, to produce a nested collection of OS map tiles::

            files = [['OS 1:1,000,000', '/directory/to/1_to_1m'],
                     ['OS 1:250,000', '/directory/to/1_to_250k'],
                     ['OS 1:50,000', '/directory/to/1_to_50k'],
                    ]
            r = NestedImageCollection.from_configuration('os',
                                                         ccrs.OSGB(),
                                                         files,
                                                         )

        """
        scanned_collections = []
        for pair_name, pair_dir in name_dir_pairs:
            scanned = img_collection_cls(pair_name, crs)
            scanned.scan_dir_for_imgs(pair_dir, glob_pattern=glob_pattern)
            scanned_collections += [scanned]
        return cls(name, crs, scanned_collections)
Esempio n. 5
0
def document_collection(resource, path, root_discovery, discovery, css=CSS):
    """Document a single collection in an API.

  Args:
    resource: Collection or service being documented.
    path: string, Dot separated name of the resource.
    root_discovery: Deserialized discovery document.
    discovery: Deserialized discovery document, but just the portion that
      describes the resource.
    css: string, The CSS to include in the generated file.

  Returns:
    string, the generated HTML page.
  """
    collections = []
    methods = []
    html = [
        "<html><body>",
        css,
        "<h1>%s</h1>" % breadcrumbs(path[:-1], root_discovery),
        "<h2>Instance Methods</h2>",
    ]

    # Split the public callables into nested collections and plain methods.
    for name in dir(resource):
        if name.startswith("_"):
            continue
        attr = getattr(resource, name)
        if not callable(attr):
            continue
        if hasattr(attr, "__is_resource__"):
            collections.append(name)
        else:
            methods.append(name)

    # TOC: links to the nested collections' own pages.
    for name in collections:
        href = path + name + ".html"
        html.append(
            string.Template(COLLECTION_LINK).substitute(href=href, name=name)
        )

    # TOC: one summary entry per method, built from its docstring.
    for name in methods:
        doc = getattr(resource, name).__doc__
        params = method_params(doc)
        firstline = doc.splitlines()[0]
        html.append(
            string.Template(METHOD_LINK).substitute(
                name=name, params=params, firstline=firstline
            )
        )

    if methods:
        html.append("<h3>Method Details</h3>")
        for name in methods:
            html.append(method(name, getattr(resource, name).__doc__))

    html.append("</body></html>")

    return "\n".join(html)
Esempio n. 6
0
    def from_configuration(cls, name, crs, name_dir_pairs,
                           glob_pattern='*.tif',
                           img_class=Img):
        """
        Build a :class:`~cartopy.io.img_nest.NestedImageCollection` instance
        from a list of image collection name and directory path pairs.

        This allows the whole nested structure to be created directly from
        simple configuration data.

        For example, to produce a nested collection of OS map tiles::

            files = [['OS 1:1,000,000', '/directory/to/1_to_1m'],
                     ['OS 1:250,000', '/directory/to/1_to_250k'],
                     ['OS 1:50,000', '/directory/to/1_to_50k'],
                    ]
            r = NestedImageCollection.from_configuration('os',
                                                         ccrs.OSGB(),
                                                         files)

        .. important::
            The list of image collection name and directory path pairs must be
            given in increasing resolution order i.e. from low resolution to
            high resolution.

        Args:

        * name:
            The name for the
            :class:`~cartopy.io.img_nest.NestedImageCollection` instance.

        * crs:
            The :class:`~cartopy.crs.Projection` of the image collection.

        * name_dir_pairs:
            A list of image collection name and directory path pairs.

        Kwargs:

        * glob_pattern:
            The image collection filename glob pattern.
            Defaults to '*.tif'.

        * img_class:
            The class of images created in the image collection.

        Returns:
            A :class:`~cartopy.io.img_nest.NestedImageCollection` instance.

        """
        def _scan(label, directory):
            # One ImageCollection per configured directory.
            scanned = ImageCollection(label, crs)
            scanned.scan_dir_for_imgs(directory,
                                      glob_pattern=glob_pattern,
                                      img_class=img_class)
            return scanned

        nested = [_scan(label, directory)
                  for label, directory in name_dir_pairs]
        return cls(name, crs, nested)
Esempio n. 7
0
    def from_configuration(cls,
                           name,
                           crs,
                           name_dir_pairs,
                           glob_pattern='*.tif',
                           img_class=Img):
        """
        Build a :class:`~cartopy.io.img_nest.NestedImageCollection` instance
        from a list of image collection name and directory path pairs.

        This allows the whole nested structure to be created directly from
        simple configuration data.

        For example, to produce a nested collection of OS map tiles::

            files = [['OS 1:1,000,000', '/directory/to/1_to_1m'],
                     ['OS 1:250,000', '/directory/to/1_to_250k'],
                     ['OS 1:50,000', '/directory/to/1_to_50k'],
                    ]
            r = NestedImageCollection.from_configuration('os',
                                                         ccrs.OSGB(),
                                                         files)

        Parameters
        ----------
        name
            The name for the
            :class:`~cartopy.io.img_nest.NestedImageCollection` instance.
        crs
            The :class:`~cartopy.crs.Projection` of the image collection.
        name_dir_pairs
            A list of image collection name and directory path pairs.
        glob_pattern: optional
            The image collection filename glob pattern. Defaults to '*.tif'.
        img_class: optional
            The class of images created in the image collection.

        Returns
        -------
        A :class:`~cartopy.io.img_nest.NestedImageCollection` instance.

        Warnings
        --------
            The list of image collection name and directory path pairs must be
            given in increasing resolution order i.e. from low resolution to
            high resolution.

        """
        scan_options = dict(glob_pattern=glob_pattern, img_class=img_class)
        nested = []
        for label, directory in name_dir_pairs:
            scanned = ImageCollection(label, crs)
            scanned.scan_dir_for_imgs(directory, **scan_options)
            nested += [scanned]
        return cls(name, crs, nested)
Esempio n. 8
0
def _sift_tasks(mapping):
    tasks, collections = [], []
    for name, value in list(mapping.items()):
        if _is_task(name, value):
            tasks.append(name)
        elif isMappingType(value):
            collections.append(name)
    tasks = sorted(tasks)
    collections = sorted(collections)
    return tasks, collections
Esempio n. 9
0
def _sift_tasks(mapping):
    """Split the names in ``mapping`` into tasks and nested collections.

    A name goes to the task list when ``_is_task(name, value)`` is true,
    otherwise to the collection list when ``isMappingType(value)`` is true;
    anything else is ignored.

    Returns:
        A ``(tasks, collections)`` pair of sorted name lists.
    """
    task_names = []
    collection_names = []
    for key, candidate in iteritems(mapping):
        if _is_task(key, candidate):
            bucket = task_names
        elif isMappingType(candidate):
            bucket = collection_names
        else:
            continue
        bucket.append(key)
    task_names.sort()
    collection_names.sort()
    return task_names, collection_names
Esempio n. 10
0
    def _make_skos_collections(self):
        collections = []
        for k, v in self.COLLECTIONS.items():
            collections.append((
                v["default_prefLabel"],
                v["fid"],
                self._make_skos_collection((k, v)),
            ))

        return self._load_template("collections." + self.outputformat).render(
            collections=collections)
Esempio n. 11
0
def getCollectionList():
    """Return a JSON string listing the known collection names.

    Calls ``getFileCollections()`` first, then reports the sorted keys of
    the global ``fileCollections`` as
    ``{"status": "ok", "collections": [...]}``.
    """
    global fileList
    global fileCollections
    if debug:
        syslog.syslog("getCollectionList")
    getFileCollections()
    names = []
    for key in sorted(fileCollections.keys()):
        if debug:
            syslog.syslog("found collection:" + str(key))
        names += [key]
    return json.dumps({'status': 'ok', 'collections': names})
Esempio n. 12
0
def addmodule(request):
    """Show the add-module form and create the module on a valid POST.

    On a valid POST, each submitted collection id is resolved to a
    ``models.Collection`` (404 if missing), ``jobs.create_module`` is called
    and the client is redirected to 'docserver-manager'. Otherwise the form
    (empty for non-POST requests, bound for an invalid POST) is rendered.
    """
    if request.method != "POST":
        form = forms.ModuleForm()
    else:
        form = forms.ModuleForm(request.POST)
        if form.is_valid():
            module = form.cleaned_data["module"]
            selected = [
                get_object_or_404(models.Collection, pk=int(pk))
                for pk in form.cleaned_data['collections']
            ]
            jobs.create_module(module, selected)
            return HttpResponseRedirect(reverse('docserver-manager'))
    return render(request, 'docserver/addmodule.html', {"form": form})
Esempio n. 13
0
def setup():
    """Load the configured collections and activate the first one.

    Every entry of ``Specs().s['collections']`` is appended to the global
    ``collections`` list. The first entry is then popped into
    ``currentCollection`` and ``timeout`` is set to the current time plus
    that entry's 'time' value.
    """
    global currentCollection
    global collections
    global timeout
    global rootDir
    currentCollection = ""

    for spec in Specs().s['collections']:
        collections.append(spec)

    if debug:
        print(collections)

    # The head of the queue becomes the active collection.
    currentCollection = collections.pop(0)
    if debug:
        print("currentCollection:", currentCollection['name'])

    now = time.time()
    timeout = now + currentCollection['time']
Esempio n. 14
0
def getCollectionList():
  """Return a JSON string listing the known collection names.

  Calls ``createFileList()`` when the global ``fileList`` is empty, then
  reports the sorted keys of the global ``fileCollections`` as
  ``{"status": "ok", "collections": [...]}``.
  """
  global fileList
  global fileCollections
  if debug:
    syslog.syslog("getCollectionList")
  file_count = len(fileList)
  if file_count == 0:
    createFileList()
    file_count = len(fileList)
  names = []
  for key in sorted(fileCollections.keys()):
    if debug:
      syslog.syslog("found collection:" + str(key))
    names += [key]
  return json.dumps({'status': 'ok', 'collections': names})
Esempio n. 15
0
def compact_range_dumps(li):
    """
    Accepts a list of integers and represents it as inclusive intervals
    [1,2,3,4,6,7] => '1-4,6-7'

    The input may be unsorted and may contain duplicates. A lone number is
    written as a one-element interval ('5-5'). An empty input yields ''.
    """
    numbers = sorted(set(li))
    if not numbers:
        return ''

    ranges = []
    low = high = numbers[0]
    for number in numbers[1:]:
        if number == high + 1:
            # Still contiguous: extend the current interval.
            high = number
        else:
            # Gap: close the current interval and start a new one at the
            # current number, so that number is not lost.
            ranges.append('{}-{}'.format(low, high))
            low = high = number
    ranges.append('{}-{}'.format(low, high))
    return ','.join(ranges)
Esempio n. 16
0
def query(query, uris=None, exact=False):
    """Build a search string from a keyword query and optional URIs.

    Each ``(key, values)`` item of ``query`` is converted to a term with
    ``_QUERYMAP[key](values)``. The paths of all ``uris`` are combined into
    one ``collection:(a OR b ...)`` term. All terms are joined with ' AND '.

    Raises:
        ValueError: if ``exact`` is true, if a keyword is not supported, or
            if a URI has no path but has a query or fragment.
    """
    if exact:
        raise ValueError('Exact queries not supported')

    terms = []
    items = query.iteritems() if query else []
    for key, values in items:
        try:
            built = _QUERYMAP[key](values)
        except KeyError:
            raise ValueError('Keyword "%s" not supported' % key)
        terms.append(built)

    paths = []
    for uri in uris or []:
        parts = uritools.urisplit(uri)
        if parts.path:
            paths.append(parts.path)
            continue
        # No path, query or fragment: treated as the root URI and skipped.
        if parts.query or parts.fragment:
            raise ValueError('Cannot search "%s"' % uri)

    if paths:
        terms.append('collection:(%s)' % ' OR '.join(paths))
    return ' AND '.join(terms)
Esempio n. 17
0
def query(query, uris=None, exact=False):
    """Build a search string from a keyword query and optional URIs.

    Each ``(key, values)`` item of ``query`` is converted to a term with
    ``_QUERYMAP[key](values)``. The paths of all ``uris`` are combined into
    one ``collection:(a OR b ...)`` term. All terms are joined with " AND ".

    Raises:
        ValueError: if ``exact`` is true, if a keyword is not supported, or
            if a URI has no path but has a query or fragment.
    """
    if exact:
        raise ValueError("Exact queries not supported")

    terms = []
    items = query.items() if query else []
    for key, values in items:
        try:
            built = _QUERYMAP[key](values)
        except KeyError:
            raise ValueError('Keyword "%s" not supported' % key)
        terms.append(built)

    paths = []
    for uri in uris or []:
        parts = uritools.urisplit(uri)
        if parts.path:
            paths.append(parts.path)
            continue
        # No path, query or fragment: treated as the root URI and skipped.
        if parts.query or parts.fragment:
            raise ValueError('Cannot search "%s"' % uri)

    if paths:
        terms.append("collection:(%s)" % " OR ".join(paths))
    return " AND ".join(terms)
Esempio n. 18
0
def DrawGLScene():
    """Redraw the scene: background grid plus the coloured graph pairs.

    Steps:
      1. Reset the matrix, set an orthographic projection covering
         ``winWidth`` x ``winHeight`` and clear to white.
      2. Draw the grid using the spacing from ``msHandler.getGridSize()``.
      3. Build graphs from every line list returned by
         ``msHandler.getLineLists()`` and collect every pair in which one
         graph contains the other.
      4. Draw each pair as a closed outline, cycling through a 10-colour
         palette by pair index.
      5. Swap the GLUT buffers.
    """
    global lineList, showGrid
    showGrid = True

    glLoadIdentity()
    glOrtho(0, winWidth, winHeight, 0, 0.0, 100.0)
    glClearColor(1, 1, 1, 1)
    glClearDepth(1.0)
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
    glColor3f(0, 0, 0)
    glLineWidth(4)
    gridSize = msHandler.getGridSize()

    if showGrid:
        # NOTE(review): both axes are stepped over range(winHeight), so the
        # grid only covers the full window when it is square -- confirm.
        glBegin(GL_LINES)
        for i in range(0, winHeight, gridSize):
            glVertex2f(i, 0)
            glVertex2f(i, winHeight)

            glVertex2f(0, i)
            glVertex2f(winHeight, i)
        glEnd()

    lineLists = msHandler.getLineLists()

    all_graphs = []
    for l in lineLists:
        Graph = MS_Graph(l)
        all_graphs.extend(Graph.BFS(Graph.getGraph()))

    # Pair up graphs where one contains the other. The last graph is never
    # used as the first element of a pair.
    # NOTE(review): a graph is also compared with itself; the original code
    # had a no-op `if g1 == g2: pass` here, so skipping self-pairs may have
    # been intended -- confirm before changing.
    # `Graph` is the MS_Graph built for the last line list; it is only
    # reached when at least one graph exists.
    collections = []
    for g1 in all_graphs[:-1]:
        for g2 in all_graphs:
            if Graph.containsGraph(g1, g2) or Graph.containsGraph(g2, g1):
                collections.append((g1, g2))

    print("\n\n\n", collections)

    # Pair colours, selected by pair index modulo the palette size.
    palette = (
        (1.0, 0.0, 0.0),
        (0.0, 1.0, 0.0),
        (0.0, 0.0, 1.0),
        (1.0, 1.0, 0.0),
        (1.0, 0.0, 1.0),
        (0.0, 1.0, 1.0),
        (0.0, 0.5, 0.0),
        (1.0, 0.5, 0.0),
        (0.5, 1.0, 0.0),
        (0.5, 0.0, 1.0),
    )

    for idx, collection in enumerate(collections):
        red, green, blue = palette[idx % len(palette)]
        for graph in collection:
            glBegin(GL_LINES)
            glColor3f(red, green, blue)
            for i in range(len(graph) - 1):
                if i == 0:
                    # Close the outline: last vertex back to the first.
                    glVertex2f(graph[-1][0], graph[-1][1])
                    glVertex2f(graph[i][0], graph[i][1])
                glVertex2f(graph[i][0], graph[i][1])
                glVertex2f(graph[i + 1][0], graph[i + 1][1])
            glEnd()

    glutSwapBuffers()
		def parse_series(self, response):
			print "Series"
			self.instancialize_database()			
			if not self.check_logged(response):
				return self.log_in(response)
				
			update_id = None
			try:
				#Check if there is a dummy, if there is update only. If there inst the id will be none
				update_id = self.dbase.get_spider_item_id(response.url, 'entity')
			except ValueError as e:
				print "Error on getting dummy id on Series", e.message
			except:
				print "Error on getting dummy on Series", sys.exc_info()[0]
				util.PrintException()
				
			#Get romanized title
			romanized_title = response.css('span.releasestitle.tabletitle::text').extract()

			#Get description
			description = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(2)::text').extract()
			
			#Get webnovel link
			webnovel_link = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(2) a::attr(href)').extract()
			
			
			#Get type
			type = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(5)::text').extract()
			
			#Get titles
			associated_name = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(11)::text').extract()
			
			#Get people
			author_url = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(17) a::attr(href)').extract()
			author_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(17) a u::text').extract()
			author_alias_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(17)::text').extract()
			
			artist_url = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(20) a::attr(href)').extract()
			artist_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(20) a u::text').extract()
			artist_alias_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(20)::text').extract()
			
			#Get company
			original_publisher_url = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(26) a::attr(href)').extract()
			original_publisher_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(26) a u::text').extract()
			original_publisher_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(26)::text').extract()
			
			serialized_publisher = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(29) a::attr(href)').extract()
			serialized_publisher_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(29) a u::text').extract()
			serialized_publisher_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(29)::text').extract()
			
			english_publisher_url = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(35) a::attr(href)').extract()
			english_publisher_alias = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(35) a u::text').extract()
			english_publisher_text = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(35)::text').extract()
			
			#Get year
			year = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div:nth-child(23)::text').extract()
			
			#Get related items.
			related = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(8) a::attr(href)').extract()
			related_text = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(8) a::text').extract()
			related_type = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(8)::text').extract()
			
			#Get status 
			status = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(20)::text').extract()
			
			#Get animé comparative			
			anime_start_end = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(26)::text').extract()

			#Get releases
			releases = response.css('div.sContainer:nth-child(3) > div:nth-child(1) > div:nth-child(17) a[rel=nofollow]::attr(href)').extract()

			#Get image
			images = response.css('div.sContainer:nth-child(4) > div:nth-child(1) > div center img::attr(src)').extract()
	
			#Get partial categories
			categories = response.css('li.tag_normal a::text').extract()
			
			try:	
			
				#format romanized title
				romanized_title = util.sanitize_title(romanized_title[0])
				
				#format description (synopsis)
				description = util.sanitize_content(description)
				
				descriptions = []
				if(description):
					new_description = {}
					new_description['language_id'] = self.dbase.language_en
					new_description['content'] = description
					descriptions.append(new_description)
				
				#format titles
				titles = []
				language_titles = []
				for name in associated_name:
					new_name = util.sanitize_title(name)
					if(new_name):
						language = langid.classify(new_name)
						language_titles.append(language[0])
						
						language_id = self.dbase.get_language_id_from_code(language[0])
						
						new_title = {}
						new_title['title'] = new_name
						new_title['language_id'] = language_id
						
						titles.append(new_title)
				
				webnovel = False
				country_id = None
				language_id = None
				
				#format partial categories
				if(categories):
					categories = " ".join(categories)
						
				#format type
				type = util.sanitize_content(type)
				if(type == None):
					entity_type_id = self.dbase.entity_type_manga
				elif(type == 'Manga'):
					entity_type_id = self.dbase.entity_type_manga
					country_id = self.dbase.country_jp
					language_id = self.dbase.language_ja
				elif(type == 'Manhaw'):
					entity_type_id = self.dbase.entity_type_manhaw
					country_id = self.dbase.country_kr
					language_id = self.dbase.language_ko
				elif(type == 'Manhua'):
					entity_type_id = self.dbase.entity_type_manhua
					country_id = self.dbase.country_cn
					language_id = self.dbase.language_zh
				elif(type == 'Novel'):
					new_status = " ".join(status)
						
					#if is there is Web Volumes or Web Chapters in status
					if "Web Novel" in categories or re.search("Web",new_status) != None:
						entity_type_id = self.dbase.entity_type_webnovel
						webnovel = True
					else:
						entity_type_id = self.dbase.entity_type_lightnovel
				else:
					#Add new type:
					entity_type_id = self.dbase.add_type(type, 'entity')
					if(entity_type_id == None):
						entity_type_id = self.dbase.entity_type_manga
				
				#format people
				#remove [ from name
				#get author. If author don't exists create dummy author   
				peoples = []
				
				author_alias_text = [x for x in author_alias_text if x != ']']
				relation_type_id = self.dbase.people_relation_type_writer
				
				for index, url in enumerate(author_url):
					add_dummy = False
					if 'add_author' in url:
						#Add dummy author
						try:
							people_name = util.get_formatted_name(util.sanitize_content(author_alias_text[index]))
						except IndexError as e:
							people_name = None
							
						if people_name:
							add_dummy = True
						else:
							util.Log(response.url, "Error on getting people name to insert dummy.", False)
					else:
						where_values = []
						where_values.append(url)
						where_values.append('people')
						#Get author id from link.
						people_id = self.dbase.get_var('spider_item', ['id'], "url = %s and table_name = %s", where_values)
						people_name = util.get_formatted_name(util.sanitize_content(author_alias[index]))
						if(people_id == None):
							if people_name:
								add_dummy = True
							else:
								util.Log(response.url, "Error on getting people name to insert dummy on line 295.", False)
						else:
							where_values = []
							where_values.append(people_name['name'])
							where_values.append(people_name['lastname'])
							where_values.append(people_id)
							alias_used_id = self.dbase.get_var('people_alias', ['id'], "name = %s and lastname = %s and people_id = %s", where_values)
							if(alias_used_id == None):
								#Insert alias.
								alias_used_id = self.dbase.add_people_alias(people_name['name'], people_name['lastname'], people_id, self.dbase.alias_type_alias)
								
					if(add_dummy):
						people_country = self.dbase.country_jp
						people_id = self.dbase.create_people(people_name['name'], people_name['lastname'], people_country)
						
						where_values = []
						where_values.append(people_name['name'])
						where_values.append(people_name['lastname'])
						where_values.append(people_id)
						alias_used_id = self.dbase.get_var('people_alias', ['id'], "name = %s and lastname = %s and people_id = %s", where_values)
						self.dbase.add_spider_item('people', people_id, url)
						#print "Added dummy people"
						
					if people_id and alias_used_id:
						new_people = {}
						new_people['id'] = people_id
						new_people['alias_used_id'] = alias_used_id
						new_people['relation_type_id'] = relation_type_id
						peoples.append(new_people)
					else:
						util.Log(response.url, "Error on getting people name author to insert.", False)
						
				artist_alias_text = [x for x in artist_alias_text if x != ']']
				relation_type_id = self.dbase.people_relation_type_illustrator
					
				for index, url in enumerate(artist_url):
					add_dummy = False
					if 'add_author' in url:
						#Add dummy author
						try:
							people_name = util.get_formatted_name(util.sanitize_content(artist_alias_text[index]))
						except IndexError as e:
							people_name = None
						
						if people_name:
							add_dummy = True
						else:
							util.Log(response.url, "Error on getting people name to insert dummy.", False)
					else:
						#Get author id from link.
						people_id = self.dbase.get_spider_item_id(url, 'people')
						people_name = util.get_formatted_name(util.sanitize_content(artist_alias[index]))
						if(people_id == None):
							if people_name:
								add_dummy = True
							else:
								util.Log(response.url, "Error on getting people name to insert dummy on line 350.", False)
						else:
							where_values = []
							where_values.append(people_name['name'])
							where_values.append(people_name['lastname'])
							where_values.append(people_id)
							alias_used_id = self.dbase.get_var('people_alias', ['id'], "name = %s and lastname = %s and people_id = %s", where_values)
							if(alias_used_id == None):
								#Insert alias.
								alias_used_id = self.dbase.add_people_alias(people_name['name'], people_name['lastname'], people_id, self.dbase.alias_type_alias)
								
					if(add_dummy):						
						if(country_id != None):
							people_country = country_id
						else:
							people_country = self.dbase.country_jp
							
						people_id = self.dbase.create_people(people_name['name'], people_name['lastname'], people_country)
						
						where_values = []
						where_values.append(people_name['name'])
						where_values.append(people_name['lastname'])
						where_values.append(people_id)
						alias_used_id = self.dbase.get_var('people_alias', ['id'], "name = %s and lastname = %s and people_id = %s", where_values)
						self.dbase.add_spider_item('people', people_id, url)
						print "Added dummy people"
						
					if people_id and alias_used_id:
						new_people = {}
						new_people['id'] = people_id
						new_people['alias_used_id'] = alias_used_id
						new_people['relation_type_id'] = relation_type_id
						peoples.append(new_people)
					else:
						util.Log(response.url, "Error on getting people name artist to insert.", False)
				
				#format company
				companies = []
				
				company_publisher = False
				original_publisher_text = [x for x in original_publisher_text if x != ']']
				
				company_function_type_id = self.dbase.company_function_type_publisher
				
				#print original_publisher_url, original_publisher_alias, original_publisher_text
				if not "N/A" in original_publisher_url:
					for index, url in enumerate(original_publisher_url):
						add_dummy = False
						if 'add_publisher' in url:
							#Add dummy company
							company_name = util.sanitize_content(original_publisher_text[index])
							add_dummy = True
						else:
							#Get author id from link.
							company_id = self.dbase.get_spider_item_id(url, 'company')
							company_name = util.sanitize_content(original_publisher_alias[index])
							
							if(company_id == None):
								add_dummy = True
							else:
								where_values = []
								where_values.append(company_name)
								where_values.append(company_id)
								alias_used_id = self.dbase.get_var('company_alias', ['id'], "name = %s and company_id = %s", where_values)
								#print "Alias used id", alias_used_id
								#Get country_id from original publisher
								if(country_id == None):
									where_values = []
									where_values.append(company_id)
									country_id = self.dbase.get_var('company', ['country_id'], "id = %s", where_values)
									if(country_id == None):
										country_id = self.dbase.country_jp
										
								#Get language_id from original publisher
								if(language_id == None):
									language_id = self.dbase.get_language_from_country_id(country_id, self.dbase.language_ja)
								
								if(alias_used_id == None):
									#Insert alias.
									language = langid.classify(company_name)
									code = []
									code.append(language[0])
									language_id = self.dbase.get_var('language', ['id'], "code = %s", code)
									#print "Name :", company_name
									alias_used_id = self.dbase.add_alias(company_name, company_id, language_id, 'company', self.dbase.alias_type_alias)
									
						#print "Company name: ", company_name
						if(add_dummy):
							if(country_id != None):
								country_origin_id = country_id
							else:
								country_origin_id = self.dbase.country_jp
								
							if not language_id:
								language_id = self.dbase.language_ja
								
							company_id = self.dbase.create_company(company_name, language_id, country_origin_id, None, None, None, None, None,
							[], [], [], [], [], [], [], [])	
			
							where_values = []
							where_values.append(company_name)
							where_values.append(company_id)
							alias_used_id = self.dbase.get_var('company_alias', ['id'], "name = %s and company_id = %s", where_values)
							self.dbase.add_spider_item('company', company_id, url)
							print "Added dummy company"

						new_company = {}
						new_company['id'] = company_id
						new_company['function_type_id'] = company_function_type_id
						companies.append(new_company)
						
						company_publisher = True
					
				magazines = []
				#Get serialized maganize:
				for index, magazine in enumerate(serialized_publisher_alias):
					magazines.append(magazine + " " + serialized_publisher_text[index])
				
				magazines = util.sanitize_content(magazines)
				
				company_function_type_id = self.dbase.company_function_type_translator
				
				#Get english company:
				if not "N/A" in english_publisher_text:
					country_origin_id = self.dbase.country_us
					language_release = self.dbase.language_en
					
					for index, url in enumerate(english_publisher_url):
						add_dummy = False
						if 'add_publisher' in url:
							#Add dummy company
							company_name = util.sanitize_content(english_publisher_text[index])
							add_dummy = True
						else:
							#Get author id from link.
							company_id = self.dbase.get_spider_item_id(url, 'company')
							company_name = util.sanitize_content(english_publisher_alias[index])
							
							if(company_id == None):
								add_dummy = True
							else:
								where_values = []
								where_values.append(company_name)
								where_values.append(company_id)
								alias_used_id = self.dbase.get_var('company_alias', ['id'], "name = %s and company_id = %s", where_values)
								
								if(alias_used_id == None):
									#Insert alias.
									#print "Name :", company_name
									alias_used_id = self.dbase.add_alias(company_name, company_id, language_release, 'company', self.dbase.alias_type_alias)
									
						#print "Company name: ", company_name
						if(add_dummy):
								
							company_id = self.dbase.create_company(company_name, language_release, country_origin_id, None, None, None, None, None,
							[], [], [], [], [], [], [], [])	
			
							where_values = []
							where_values.append(company_name)
							where_values.append(company_id)
							alias_used_id = self.dbase.get_var('company_alias', ['id'], "name = %s and company_id = %s", where_values)
							self.dbase.add_spider_item('company', company_id, url)
							print "Added dummy company"

						new_company = {}
						new_company['id'] = company_id
						new_company['function_type_id'] = company_function_type_id
						companies.append(new_company)

						
				
				
				create_webnovel_also = False
					
				if(webnovel and company_publisher):
					create_webnovel_also = True
					entity_type_id = self.dbase.entity_type_lightnovel
						
				#format year
				year = util.sanitize_content(year[0])
				
				language = None
				
				#format country. Get country from associated name, if not found country_id will be Japan.
				if not country_id:
					language_country = {'ja': self.dbase.country_jp, 'ko': self.dbase.country_kr, 'zh': self.dbase.country_cn}
					
					language_test = {'ja': 0, 'ko': 0, 'zh' : 0}
					for title in titles:
						if title['language_id'] == self.dbase.language_ja:
							language_test['ja'] += 1
						elif title['language_id'] == self.dbase.language_ko:
							language_test['ko'] += 1
						elif title['language_id'] == self.dbase.language_zh:
							language_test['zh'] += 1
					
					if(language_test['ja'] == language_test['ko'] and language_test['ko'] == language_test['zh']):
						language = language_country['ja']
					else:
						language, value = max(language_test.iteritems(), key=lambda x: x[1])
					
					if language in ['ja', 'ko', 'zh']:
						country_id = language_country[language]
					else:
						country_id = self.dbase.country_jp
						
				if not language_id:
					if language in ['ja', 'ko', 'zh']:
						languages = {'ja': self.dbase.language_ja, 'ko': self.dbase.language_ko, 'zh': self.dbase.language_zh}
						language_id = languages[language]
					else:
						language_id = self.dbase.language_ja
					
				#format related
				relateds = []
				
				if(related):
					lenght_related_text = len(related_text)
					for index, item in enumerate(related):
						#Save dummy if not on database, if in database get id.
						dummy_series_id = self.dbase.get_spider_item_id(item, 'entity')
						collection_series_id = None
						
						if dummy_series_id == None:
							if index < lenght_related_text:
								dummy_name = util.sanitize_title(related_text[index])
							else:
								dummy_name = None
							#Create dummy
							dummy_series_id = self.dbase.create_entity(dummy_name, self.dbase.entity_type_manga, self.dbase.classification_type_12, language_id, country_id)
							self.dbase.add_spider_item('entity', dummy_series_id, item)
						else:
							#Get collection from database:
							where_values = []
							where_values.append(collection_series_id)
							collection_series_id = self.dbase.get_var('entity', ['collection_id'], "id = %s", where_values)
							
						#print "Dummy" , dummy_series_id
						new_related_type = util.sanitize_content(related_type[index])
						
						if new_related_type:
							new_related_type = new_related_type.replace('(', '')
							new_related_type = new_related_type.replace(')', '')
							related_type_id = self.dbase.add_type(new_related_type, 'based')
						else:
							related_type_id = self.dbase.based_type_sequel_spinoff
						
						new_related = {}
						new_related['id'] = dummy_series_id
						new_related['type_id'] = related_type_id
						new_related['type_name'] = new_related_type
						new_related['collection_id'] = collection_series_id
						relateds.append(new_related)
				
				#Format images. The correct would be the edition have image and not entity. But mangaupdate don't save any related editions. 
				#image = image[0]
				formatted_image = []
				for image in images:
					image_array = image.split('.')
					new_image = {}
					new_image['url'] = image
					new_image['extension'] = image_array.pop()
					new_image_name = image_array.pop()
					new_image_name = new_image_name.split('/')
					new_image['name'] = new_image_name.pop()
					formatted_image.append(new_image)
				
				#Format related Doujinshi
				category_adult = False	
				related_doujin = False
				
				#check if is doujinshi on title
				if(re.search(self.pattern_doujin, romanized_title) != None):
					#if is doujinshi, create a relation of doujinshi type.
					related_doujin = True
					category_adult = True
					#Get original from first part of " dj - "
					original_name = re.sub(self.pattern_remove_doujin, '', romanized_title)
					original_name = util.sanitize_title(original_name)
					if(original_name):
						where_values = []
						where_values.append(original_name)
						original_id = self.dbase.get_var('entity_alias', ['entity_id'], "name = %s", where_values)
					else:
						original_name = 'Unkown name (Cralwer)'
						original_id = None
						
					if not original_id:
						#create dummy:
						original_id = self.dbase.create_entity(original_name, self.dbase.entity_type_manga, self.dbase.classification_type_12, language_id, country_id)
						self.dbase.add_spider_item('entity', original_id, 'Unknown')
						
				#format collection
				collection_id = None
				collection_started = 'False'
						
				#Get a collection from a related item only if type is prequel, sequel or spin-off. 
				if related_doujin:
					#Get collection from original_name. if don't exists create collection.
					collection_id = self.dbase.create_collection(original_name)
				else:
					#if there is related items
					if relateds:
						
						#TODO:
						#Get collection from related items (Get from database because some other spider could make other item related with this).
						#Check if related is sequel, doujinshi or based on. If is there is a collection with the name of this entity. Collection will be the first part of the name.
						#Check if related is prequel, if is the collection is the name of prequel if there inst a collection on the prequel.
						#Update name of collection if there is more than one prequel. check recursive prequel.
						#Check which item started the collection.
						#if none found create collection from most used name.
						#Get name to make a new collection name.
						#self.dbase.get_related_item(self, table, first_field, second_field, relation_type, type_id, entity_id, limit = None)
						
						for item in relateds:
							if item['collection_id']:
								collection_id = item['collection_id']
								break
				
				if not collection_id:
					#Check if name is similar to another collection already registered. Only check if name is larger then 3 characters.
					#This method can have mismatch collection names and collections will need to be check after all items was crawled using get_related_item.
					if(len(romanized_title) > 3):
						series_name = []
						series_name.append(romanized_title)
						collection_id = self.dbase.get_col('collection', 'id', "%s LIKE '%%' || name || '%%'", series_name)
					
						if not collection_id:
							#create new collection with the first name type, get firstname part using regex.
							original_name = re.sub(self.pattern_replace_name,'',romanized_title)
							if not original_name:
								original_name = romanized_title
							collection_id = self.dbase.create_collection(original_name)
						elif(isinstance(collection_id, collections.Iterable) and not isinstance(collection_id, types.StringTypes)):
							#return the element most appear on list
							collections = []
							for new_id in collection_id:
								collections.append(new_id[0])
							
							collection_id = util.most_common_oneliner(collections)
				
				#	Change this to use a relation on database.
				
				#format status
				status = util.sanitize_content(status)
			
				#format animé comparative
				anime_start_end = util.sanitize_content(anime_start_end)

				#format classification_type_id
				if(categories):
					if re.search(ur'[Mm]ature', categories) != None or re.search(ur'[Aa]dults?', categories) != None or re.search(ur'[Hh]entais?', categories) != None or re.search(ur'[Dd]oujin([ -]?shi)?s?', categories) != None or re.search(ur'[Ss]einens?', categories) != None:
						category_adult = True
								
				if(not category_adult):
					classification_type_id = self.dbase.classification_type_12
				else:
					classification_type_id = self.dbase.classification_type_18
Esempio n. 20
0
    def _extract_collections(self):
        """Extracts standard SKOS Collection metadata.

        Every subject in ``self.G`` typed ``skos:Collection`` gets an entry in
        ``self.COLLECTIONS`` keyed by its URI (as a string). Each entry is a
        dict with the keys ``fid``, ``default_prefLabel``, ``prefLabels``
        (list of ``(label, language)`` tuples), ``altLabels``, ``definitions``,
        ``scopeNotes``, ``source`` and ``members``. A ``topConceptOfs`` list is
        added only for Collections that actually carry ``skos:topConceptOf``.

        Raises:
            Exception: if a Collection has no prefLabel, or if
                ``self._make_fid`` fails for it.
        """
        # TODO: handle OrderedCollections
        collections = [
            str(s)
            for s in self.G.subjects(predicate=RDF.type, object=SKOS.Collection)
        ]

        # insert in sorted order so an insertion-ordered mapping stays ordered
        for c in sorted(collections):
            self.COLLECTIONS[c] = {}

        # keys whose values are gathered as sets and handed back as lists
        set_keys = ("prefLabels", "altLabels", "definitions", "scopeNotes", "members")

        # fill in each Collection's details from the graph
        for c in self.COLLECTIONS.keys():
            s = URIRef(c)  # for use in Graph() loops
            details = self.COLLECTIONS[c]

            details["fid"] = None
            details["default_prefLabel"] = None
            details["prefLabels"] = set()
            details["altLabels"] = set()
            details["definitions"] = set()
            details["scopeNotes"] = set()
            details["source"] = None
            details["members"] = set()

            for p, o in self.G.predicate_objects(subject=s):
                if p == SKOS.prefLabel:
                    details["prefLabels"].add((str(o), o.language))
                    if o.language == self.default_language:
                        details["default_prefLabel"] = str(o)

                elif p == SKOS.altLabel:
                    details["altLabels"].add(str(o))  # TODO: add in language

                elif p == SKOS.definition:
                    details["definitions"].add(str(o))  # TODO: add in language

                elif p == SKOS.scopeNote:
                    details["scopeNotes"].add(str(o))  # TODO: add in language

                elif p == DCTERMS.source:
                    details["source"] = str(o)

                elif p == SKOS.topConceptOf:
                    # This key is not pre-initialised above, so create it on
                    # first use instead of failing with a KeyError.
                    details.setdefault("topConceptOfs", set()).add(str(o))

                elif p == SKOS.member:
                    details["members"].add(str(o))
                    # TODO: handle members that are other Collections, not Concepts

            # listify the sets
            for key in set_keys:
                details[key] = list(details[key])
            if "topConceptOfs" in details:
                details["topConceptOfs"] = list(details["topConceptOfs"])

            # make fid
            try:
                # Prefer the default-language label: the listified set has no
                # stable order, so [0] alone could change between runs when a
                # Collection has several prefLabels.
                pl = details["default_prefLabel"]
                if pl is None:
                    pl = details["prefLabels"][0][0]

                details["fid"] = self._make_fid(
                    pl, c
                )
            except Exception as e:
                print(e)
                raise Exception(
                    "You Collection {}  doesn't have a label but it needs one!".format(c)
                ) from e
		def parse_goods(self, response, media = False, figure = False):
			self.instancialize_database()
			if not self.check_logged(response):
				return self.log_in(response.url)
				
			if media:
				print "Media"
			elif figure:
				print "Figure"
			else:
				print "Goods"
			
			update_id = None
			try:
				#Check if there is a dummy, if there is update only. If there inst the id will be none
				if media:
					#Get update id from entity or from soundtrack or album
					update_id = self.dbase.get_spider_item_id_from_url(response.url)
				else:
					update_id = self.dbase.get_spider_item_id(response.url, 'goods')
			except ValueError as e:
				if media:
					print "Error on getting dummy id on Media", e.message
				else:
					print "Error on getting dummy id on Goods", e.message
			except:
				if media:
					print "Error on getting dummy on Media", sys.exc_info()[0]
				else:
					print "Error on getting dummy on Goods", sys.exc_info()[0]
				util.PrintException()
			
			#Get title
			title = response.css('#wide > h1 span[itemprop="name"]::attr(title)').extract()
			
			#Get details list
			details_list = response.css('ul.sd:nth-child(1) li')
			
			#Get release list
			release_list = response.css('#ref-releases + ul li')
			
			#Get pictures page. Get url from ID. 
			#image_link = response.css('.tab > li:nth-child(6) a::attr(href)').extract()
			main_picture = response.css('.db-picture img::attr(src)').extract()

			#Get tags
			tags = response.css('.tags a:not([title=Information]):not([title=Yes]):not([title=No])::attr(title)').extract()#Exclude No, yes, information
			
			#Get related items
			related_url = response.css('ul.item:nth-child(14) li a::attr(href)').extract()
			related_text = response.css('ul.item:nth-child(14) li a::text').extract()
			related_type = response.css('ul.item:nth-child(14) li em::text').extract()#Only use if related_type and url have the same amount.
			
			#Get observation
			observations = response.css('div.msg::text').extract()
			if not observations:
				observations = response.css('div.msg div::text').extract()
			
			try:
				#Format title
				title = util.sanitize_title(title)
				
				#Format details list
				
				price, id, scale_id, release_date, scale = None, None, None, None, None
				versions_id, categories_id, comments, companies, materials, artists, personas, entities = [], [], [], [], [], [], [], []
				counterfeit, cast_off, r18, region_free = False, False, False, False
				weight, width, length, height = None, None, None, None
				
				
				for item in details_list:
					new_item = item.css('label::text').extract()
					new_item = util.sanitize_title(new_item)
				
					new_content = item.css('div')

					if new_item and new_content:
						#print new_item
						#Check ID
						if new_item == "ID":
							id = new_content.css('::text').extract()
							id = util.sanitize_title(id)
							if id:
								id = id.replace('#','')
							
						#Check price
						elif re.search(self.pattern_price, new_item) != None:
							price = new_content.css('::text').extract()
							price = util.sanitize_title(price)
							if price:
								price = price.replace(ur'¥','')
							#print price
							
						#Check category
						elif re.search(self.pattern_categories, new_item) != None:
							#Figure category: Prepained, Action/Dolls, Trading, Garage Kits, Model Kits, Acessories
							categories = new_content.css('a::text').extract()
							
							for category in categories:
								category_name = util.sanitize_title(category)
								if category_name:
									if media:
										#Save category, return category id.
										category_id = self.dbase.add_name_to_table(category_name.title(), 'category')
										categories_id.append(category_id)
								else:
									categories_id.append(category_name.title())
									
						#Check classification
						elif re.search(self.pattern_classification, new_item) != None:
							classifications = new_content.css('span.trigger > a::text').extract()
							for classification in classifications:
								#save comment on database.
								new_comment = {}
								new_comment['title'] = 'Crawler classification'
								new_comment['content'] = util.sanitize_content(classification)
								comments.append(new_comment)
								
						#Company
						elif re.search(self.pattern_companies, new_item) != None:
							new_companies = new_content.css('span.trigger')
							for company in new_companies:
								new_aliases = []
								company_name = company.css('a:nth-child(1)::text').extract()
								company_name = util.sanitize_title(company_name)
								
								if company_name:
									language_code = langid.classify(company_name)
									language_id = self.dbase.get_language_id_from_code(language_code[0])
									if not language_id:
										language_id = self.dbase.language_ja
									new_name = {}
									new_name['name'] = company_name
									new_name['language_id'] = language_id
									new_aliases.append(new_name)
									
								company_original_name = company.css('a:nth-child(1)::attr(switch)').extract()
								company_original_name = util.sanitize_title(company_original_name)
								
								if company_original_name:
									language_code = langid.classify(company_original_name)
									language_id = self.dbase.get_language_id_from_code(language_code[0])
									if not language_id:
										language_id = self.dbase.language_ja
									new_name = {}
									new_name['name'] = company_original_name
									new_name['language_id'] = language_id
									new_aliases.append(new_name)
								
								company_url = company.css('a:nth-child(1)::attr(href)').extract()
								company_url = company_url[0]
								company_type = company.css('small::text').extract()
								company_type = util.sanitize_title(company_type)
								
								if company_type:
									company_type = re.sub(self.pattern_asps, '', company_type)
									company_type_id = self.dbase.add_type(company_type, 'company_function')
								else:
									company_type_id = self.dbase.company_function_type_creator
									
								company_id = self.dbase.get_spider_item_id(company_url, 'company')
								
								if not company_id:
									#Get company from alias
									where_values = []
									where = []
									for alias in new_aliases:
										where_values.append(alias['name'])
										where.append("name = %s")
									
									where = " or ".join(where)
									company_id = self.dbase.get_var('company_alias',['company_id'], where, where_values)

								if not company_id:
									#Create dummy
									alternate_names = []
									
									if len(new_aliases) > 1:
										alternate_names = new_aliases[1:]
												
									company_id = self.dbase.create_company(new_aliases[0]['name'], self.dbase.language_ja, self.dbase.country_jp, None, None, None, None, None, [], [], [], [], [], [], [], alternate_names)
									self.dbase.add_spider_item('company', company_id, company_url)
								else:
									#Add alias
									for alias in new_aliases:
										self.dbase.add_alias(alias['name'], company_id, alias['language_id'], 'company', self.dbase.alias_type_alias)
										
								new_company = {}
								new_company['id'] = company_id
								new_company['function_type_id'] = company_type_id
								companies.append(new_company) 

						#Character
						elif re.search(self.pattern_character, new_item):
							anchor_texts = new_content.css('span.trigger > a::text').extract()
							anchor_urls = new_content.css('span.trigger > a::attr(href)').extract()
							anchor_switchs = new_content.css('span.trigger > a::attr(switch)').extract()#original name
						
							for index, anchor_text in enumerate(anchor_texts):
								aliases = []
								anchor_text = util.sanitize_title(anchor_text)
								if anchor_text:
									aliases.append(anchor_text)
								anchor_alias = util.sanitize_title(anchor_switchs[index])
								if anchor_alias:
									aliases.append(anchor_alias)
								#Get id from spider_item
								persona_id = self.dbase.get_spider_item_id(anchor_urls[index], 'persona')
								if aliases:
									persona = {}
									persona['alias'] = aliases
									persona['id'] = persona_id
									personas.append(persona)

						#Origin
						elif re.search(self.pattern_origin, new_item):
							anchor_texts = new_content.css('span.trigger > a::text').extract()
							anchor_urls = new_content.css('span.trigger > a::attr(href)').extract()
							anchor_switchs = new_content.css('span.trigger > a::attr(switch)').extract()#original name
						
							for index, anchor_text in enumerate(anchor_texts):
								aliases = []
								#Get id from spider_item
								entity_id = self.dbase.get_spider_item_id_from_url(anchor_urls[index])
								anchor_text = util.sanitize_title(anchor_text)
								if anchor_text:
									aliases.append(anchor_text)
									
								anchor_alias = util.sanitize_title(anchor_switchs[index])
								if anchor_alias:
									aliases.append(anchor_alias)
								if aliases:
									entity = {}
									entity['alias'] = aliases
									entity['id'] = entity_id
									entities.append(entity)
							
						#Check Dimensions
						elif re.search(self.pattern_dimensions, new_item):
							dimensions = new_content.css('::text').extract()
							dimensions = util.sanitize_title(dimensions)
							if dimensions:
								new_dimensions = re.sub(self.pattern_inside_paren, '--', dimensions)
								dimen = new_dimensions.split('--')
								for d in dimen:
									multi = 1
									if re.search(self.pattern_m, d) != None:
										multi = 1000
									elif re.search(self.pattern_cm, d) != None:
										multi = 10
									
									new_d = re.sub(self.pattern_dimension, '', d)
									if re.search(self.pattern_w, new_d) != None:
										width = re.sub(self.pattern_alpha,'', new_d)
										width = util.convert_to_number(width) * multi
									elif re.search(self.pattern_h, new_d) != None:
										height = re.sub(self.pattern_alpha,'', new_d)
										height = util.convert_to_number(height) * multi
									elif re.search(self.pattern_l, new_d) != None:
										length = re.sub(self.pattern_alpha,'', new_d)
										length = util.convert_to_number(length) * multi
								#Register dimensions as comment.
								new_comment = {}
								new_comment['title'] = 'Crawler item dimensions'
								new_comment['content'] = dimensions
								comments.append(new_comment)
							
						#check version
						elif re.search(self.pattern_version, new_item) != None:
							versions = new_content.css('span.trigger > a::text').extract()
							for version in versions:
								version_id = util.sanitize_title(version)
								if version_id:
									if re.search(ur'[Rr]18', version_id) != None:
										r18 = True
									version_id = re.sub(ur'[Vv]er.?$', '', version_id)
									version_id = self.dbase.add_name_to_table(version_id.title(), 'goods_version')
									versions_id.append(version_id)
									
						#Check release date
						elif re.search(self.pattern_release, new_item) != None:
							release_date = new_content.css('a::text').extract()
							if release_date:
								release_date = release_date[0]
								
						#Scale
						elif new_item == "Scale":
							scale = new_content.css('span.trigger > a::text').extract()
							scale = util.sanitize_title(scale)
							if scale:
								scale_id = self.dbase.add_name_to_table(scale, 'scale')
								
						#Materials
						elif new_item == "Material":
							material = new_content.css('span.trigger > a::text').extract()
							material = util.sanitize_title(material)
							if material:
								material_id = self.dbase.add_name_to_table(material, 'material')
								materials.append(material_id)
								
						#Artist (People)
						elif re.search(self.pattern_artist, new_item):
							new_artists = new_content.css('span.trigger')
							for artist in new_artists:
								new_aliases = []
								artist_name = artist.css('a:nth-child(1)::text').extract()
								artist_name = util.sanitize_title(artist_name)
								
								if artist_name:
									new_name = util.get_formatted_name(artist_name)
									if new_name:
										new_aliases.append(new_name)
									
								artist_original_name = artist.css('a:nth-child(1)::attr(switch)').extract()
								artist_original_name = util.sanitize_title(artist_original_name)
								
								if artist_original_name:
									new_name = util.get_formatted_name(artist_original_name)
									if new_name:
										new_aliases.append(new_name)
								
								artist_url = artist.css('a:nth-child(1)::attr(href)').extract()
								artist_url = artist_url[0]
								artist_type = artist.css('small::text').extract()
								artist_type = util.sanitize_title(artist_type)
								
								if artist_type:
									artist_type = re.sub(self.pattern_asps, '', artist_type)
									artist_type_id = self.dbase.add_type(artist_type, 'create')
								else:
									artist_type_id = self.dbase.people_create_type_sculptor
									
								artist_id = self.dbase.get_spider_item_id(artist_url, 'people')
								
								if not artist_id:
									#Get artist from alias
									where_values = []
									where = []
									for alias in new_aliases:
										where_values.append(alias['name'])
										where_values.append(alias['lastname'])
										where.append("name = %s")
										where.append("lastname = %s")
									
									where = " or ".join(where)
									artist_id = self.dbase.get_var('people_alias',['people_id'], where, where_values)

								if not artist_id:
									#Create dummy
									alternate_names = []
									
									if len(new_aliases) > 1:
										alternate_names = new_aliases[1:]
										
									artist_id = self.dbase.create_people(new_aliases[0]['name'], new_aliases[0]['lastname'], self.dbase.country_jp, None, None, None, None, None, None, None, alternate_names)
									self.dbase.add_spider_item('people', artist_id, artist_url)
								else:
									#Add alias
									for alias in new_aliases:
										self.dbase.add_people_alias(alias['name'], alias['lastname'], artist_id, self.dbase.alias_type_alias)
								
								where_values = []
								where_values.append(artist_id)
								where_values.append(new_aliases[0]['name'])
								where_values.append(new_aliases[0]['lastname'])
								
								artist_alias_used = self.dbase.get_var('people_alias', ['id'], "people_id = %s and name = %s and lastname = %s", where_values)
								
								new_artist = {}
								new_artist['id'] = artist_id
								new_artist['alias_id'] = artist_alias_used
								new_artist['function_type_id'] = artist_type_id
								artists.append(new_artist)
								
						#Various. Types: 2140g, Counterfeit; 14 tracks, 1 disc, 01:17:00; 1 disc; Region-free; Cast off;
						elif new_item == "Various":
							content = ""
							
							text = new_content.css('span.trigger > a span::text').extract()
							text = util.sanitize_content(text)
							if text:
								content = text
								
							text = new_content.css('span::text').extract()
							text = util.sanitize_content(text)
							if text:
								content = content + " " + text
								
							text = new_content.css('a::text').extract()
							text = util.sanitize_content(text)
							if text:
								content = content + " " + text
								
							text = new_content.css('::text').extract()
							text = util.sanitize_content(text)
							if text:
								content = content + " " + text
								
							if content:
								if re.search(self.pattern_counterfeit, content) != None:
									counterfeit = True
									
								if re.search(self.pattern_cast_off, content) != None:
									cast_off = True
									
								if re.search(self.pattern_r18, content) != None:
									r18 = True
									
								#Check if is region free
								if re.search(self.pattern_free_region, content) != None:
									region_free = True
									
								new_comment = {}
								new_comment['title'] = 'Crawler new_item various'
								new_comment['content'] = content
								comments.append(new_comment)

				entities_origin_id = []
				persona_origin_id = []
				
				#Format character and origin
				for entity in entities: #Format first entities
					if entity['id']:
						entities_origin_id.append(entity['id'])
					else:
						#Check if there is an entity origin with the same name, else create dummy.
						where, where_values = [], []
						
						for alias in entity['alias']:
							where.append("name = %s")
							where_values.append(alias)
						where = " or ".join(where)
						return_items = self.dbase.get_col('entity_alias','entity_id', where, where_values)
						if return_items:
							if len(return_items) > 1:
								for return_item in return_items:
									new_comment = {}
									new_comment['title'] = 'Cralwer Not Associated Entity'
									new_comment['content'] = return_item[0]
									comments.append(new_comment)
							else:
								entities_origin_id.append(return_items[0][0])
						else:
							#Create dummy.
							entity_dummy_id = self.dbase.create_entity(entity['alias'][0], self.dbase.entity_type_anime, self.dbase.classification_type_12, self.dbase.language_ja, self.dbase.country_jp)
							entities_origin_id.append(entity_dummy_id)
							
				for persona in personas:
					if persona['id']:
						persona_origin_id.append(persona['id'])
					else:
						#Check if there is a persona with the same name, else create dummy.
						persona_name = util.get_formatted_name(persona['alias'])
						if persona_name:
							where_values = []
							where_values.append(persona_name['name'])
							where_values.append(persona_name['lastname'])
							return_items = self.dbase.get_col('persona_alias','persona_id', "name = %s and last_name = %s", where_values)
						if not return_items:
							persona_name = util.get_formatted_name(persona['alias'], True)
							where_values = []
							where_values.append(persona_name['name'])
							where_values.append(persona_name['lastname'])
							return_items = self.dbase.get_col('persona_alias','persona_id', "name = %s and last_name = %s", where_values)
							
						if return_items:
							if len(return_items) > 1:
								for return_item in return_items:
									new_comment = {}
									new_comment['title'] = 'Cralwer Not Associated Persona'
									new_comment['content'] = return_item[0]
									comments.append(new_comment)
							else:
								persona_origin_id.append(return_items[0][0])
						else:
							persona_name = util.get_formatted_name(persona['alias'])
							#Create dummy.
							entity_dummy_id = self.dbase.create_persona(persona_name['name'], persona_name['lastname'], 'Undefined')
							persona_origin_id.append(entity_dummy_id)	
							
				#If there is more than one dont associate.
				'''
				#Format relationship between good and entity.
				if len() == 1 or len() == 1:
					#make a relation between character and entity.
					
				else:
					#Dont make relationship, add comment.
					new_comment = {}
					new_comment['title'] = 'Cralwer Could not associated Persona'
					new_comment['content'] = return_item
					comments.append(new_comment)
				
				#Make relationship between entities
				entity_length = len(entities)
				'''
				
				if entities_origin_id:
					new_comment = {}
					new_comment['title'] = 'Cralwer Could not associated to entities'
					new_comment['content'] = util.sanitize_title(entities_origin_id)
					comments.append(new_comment)
				
				#Format picture link
				new_images = []
				for image in main_picture:
					image = re.sub(ur'\?.*','', image)
					image = re.sub(ur'\bbig\b/','large/', image)
					image_array = image.split('.')
					new_image = {}
					new_image['url'] = image
					new_image['extension'] = image_array.pop()
					new_image_name = image_array.pop()
					new_image_name = new_image_name.split('/')
					new_image['name'] = new_image_name.pop()
					new_image['image_type_id'] = self.dbase.image_good_type_main
					new_images.append(new_image)
				
				#Format tags
				tags_id = []
				for tag in tags:
					tag_id = util.get_formatted_tag(tag)
					if tag_id:
						tag_id = self.dbase.add_name_to_table(tag, 'tag')
						tags_id.append(tag_id)

					
				#Format release list
				#format release country and currency.release_date
				launch_countries = []
				for release in release_list:

					date = release.css('div:nth-child(1) .time::text').extract()
					date = util.sanitize_title(date)
					date = util.get_formatted_date(date)
					
					launch_type = release.css('div:nth-child(2) em::text').extract()
					if launch_type:
						launch_type = util.sanitize_title(launch_type[0])
						launch_type = self.dbase.add_name_to_table(launch_type, 'launch_type')
					if not launch_type:
						launch_type = self.dbase.add_name_to_table('Standard', 'launch_type')
					
					price = release.css('div:nth-child(4)::text').extract()
					price = util.sanitize_title(price)
					if price:
						price = price.replace(ur'¥', '')
					
					if not price:
						price = 0
					
					new_launch = {}
					new_launch['country_id'] = self.dbase.country_jp
					new_launch['date'] = date
					new_launch['price'] = price
					new_launch['currency_id'] = self.dbase.currency_yen
					new_launch['launch_type_id'] = launch_type
			
					#Get event
					location = util.sanitize_title(release.css('div:nth-child(3) span::attr(title)').extract())
					if location:
						new_comment = {}
						new_comment['title'] = 'Crawled location launch'
						new_comment['content'] = location + " type: " + str(launch_type) + ", " + date
						comments.append(new_comment)
					
					launch_countries.append(new_launch)
				
				#Format collection
				collection_id = None
				#Get collection from entities of origin.
				for origin in entities_origin_id:
					where_values = []
					where_values.append(origin)
					collection_id = self.dbase.get_var('entity', ['collection_id'], "id = %s", where_values)
					if collection_id:
						break			
				
				#Get collection same name as origin
				entities_origin_name = []
				if not collection_id:
					for item in entities:
						for alias in item['alias']:
							new_alias = util.normalize_collection_name(alias)
							if new_alias:
								where_values = []
								where_values.append(new_alias)
								collection_id = self.dbase.get_var('collection_alias', ['collection_id'], "name = %s", where_values)
								if not collection_id:
									entities_origin_name.append(new_alias)
								else:
									break
						if collection_id:
							break

				#Get collection name from similar collections, from the title:
				if not collection_id and len(title) > 3:
					#Check if the name is similar to another collection already registered. Only check if the name is longer than 3 characters.
					#This method can mismatch collection names, so collections will need to be checked after all items were crawled using get_related_item.
					series_name = []
					series_name.append(title)
					collection_id = self.dbase.get_col('collection_alias', 'collection_id', "%s LIKE '%%' || name || '%%'", series_name)
					
					if not collection_id:
						collection_id = self.dbase.get_col('collection_alias', 'collection_id', "name LIKE '%%' || %s || '%%'", series_name)
						if not collection_id:
							#create new collection with the first name type, get firstname part using regex.
							original_name = re.sub(self.pattern_replace_name,'',title)
							if original_name:
								collection_id = self.dbase.create_collection(original_name)
					
					if(isinstance(collection_id, collections.Iterable) and not isinstance(collection_id, types.StringTypes)):
						#return the element most appear on list
						collections = []
						for new_id in collection_id:
							collections.append(new_id[0])
							
						collection_id = util.most_common_oneliner(collections)
				
				#Else create collection from origin name. If there is no origin name get collection from classification.
				if not collection_id:
					if entities_origin_name:
						collection_id = self.dbase.create_collection(entities_origin_name[0])
						
				#Format classification
				if cast_off or r18:
					classification_type_id = self.dbase.classification_type_18
				else:
					classification_type_id = self.dbase.classification_type_16
				
				#Format observation
				observation = util.sanitize_content(observations)

				if not media:
					#Format good type
					if categories_id:
						goods_type_id = self.dbase.add_type(categories_id[0], 'goods')
					else:
						goods_type_id = self.dbase.add_type('Unkown', 'goods')
				
				#Format figure items
				if figure:
					#Format scale
					if not scale:	
						scale_id = self.dbase.scale_non_scale
					else:
						scale_id = self.dbase.add_name_to_table(scale, 'scale')
				

				#Format related items
				relations = []
				for index, type in enumerate(related_type):
					#Get id, else create dummy.
					if media:
						table = 'entity'
					else:
						table = 'goods'
						
					relation_id = self.dbase.get_spider_item_id(related_url[index], table)
					if not relation_id:
						#Create dummy
						if media:
							relation_id = self.dbase.create_entity(None, entity_type_id, classification_type_id, self.dbase.language_ja, self.dbase.country_jp)
						else:
							relation_id = self.dbase.create_goods(None, self.dbase.language_ja, goods_type_id, None, collection_id)
						
						self.dbase.add_spider_item(table, relation_id, related_url[index])
					
					type = util.sanitize_title(type)
					type_id = self.dbase.add_type(type.title(),'associated')
					new_relation = {}
					new_relation['id'] = relation_id
					new_relation['type_id'] = type_id
					relations.append(new_relation)
					
				#Format Draft.
				if "This entry is a draft" in response.body:
					draft = True
				else:
					draft = False
Esempio n. 22
0
    def parse_series(self, response, franchise=False):
        self.instancialize_database()

        if franchise:
            print "Franchise"
        else:
            print "Series"

        update_id = None
        try:
            #Check if there is a dummy; if there is, update only. If there isn't, the id will be None.
            if franchise:
                update_id = self.dbase.get_spider_item_id(
                    response.url, 'collection')
            else:
                update_id = self.dbase.get_spider_item_id(
                    response.url, 'entity')
        except ValueError as e:
            if franchise:
                print "Error on getting dummy id on Franchise", e.message
            else:
                print "Error on getting dummy id on Series", e.message
        except:
            if franchise:
                print "Error on getting dummy on Franchise", sys.exc_info()[0]
            else:
                print "Error on getting dummy on Series", sys.exc_info()[0]
            util.PrintException()

        #Get search title/collection name
        series_title = response.css(
            'div.middleframe:nth-child(3) > h1:nth-child(1)::text').extract()

        #Get series ID

        #Get details table.
        table_details = response.css('#bt tr')

        #Get english title
        english_title = table_details[1].css('td:nth-child(2)::text').extract()

        #Get romaji title
        romanji_title = table_details[2].css('td:nth-child(2)::text').extract()

        #Get furigana title
        furigana_title = table_details[3].css(
            'td:nth-child(2)::text').extract()

        #Get japanese title
        japanese_title = table_details[4].css(
            'td:nth-child(2)::text').extract()

        #Get synopsis/description
        synopsis = response.css(
            'div.middleframe:nth-child(3) > div:nth-child(2) #besttable')
        synopsis = synopsis[2].css('::text').extract()

        #Get type
        type = response.css(
            'div.middleframe:nth-child(3) > div:nth-child(2) > h3:nth-child(6)::text'
        ).extract()
        if not type:
            type = response.css(
                'div.middleframe:nth-child(3) > div:nth-child(2) > h3:nth-child(7)::text'
            ).extract()
        if not type:
            util.Log(response.url, "Verificar type dessa entitdade.", False)
            type = response.css(
                'div.middleframe:nth-child(3) > div:nth-child(2) > h3:nth-child(8)::text'
            ).extract()

        if not franchise:
            #Get related work
            related_url = response.css(
                '#tile > ul:nth-child(1) li a::attr(href)').extract()

            #Collection name
            notice = response.css('.notice_inner::text').extract()
            notice_name = response.css(
                '.notice_inner > a:nth-child(2)::text').extract()
            notice_url = response.css(
                '.notice_inner > a:nth-child(2)::attr(href)').extract()

            #Get images
            images = response.css(
                'div.middleframe:nth-child(6) > div:nth-child(2) img::attr(src)'
            ).extract()
            images_url = response.css(
                'div.middleframe:nth-child(6) > div:nth-child(2) a::attr(href)'
            ).extract()

            front_image = response.css(
                '.vector table tr > td:nth-child(4) > a:nth-child(1) > img:nth-child(1)::attr(src)'
            ).extract()
        else:
            #Get associated entities
            entities_url = response.css(
                'div.middleframe:nth-child(3) > div:nth-child(2) > ul:nth-child(14) li a::text'
            ).extract()
            entities_text = response.css(
                'div.middleframe:nth-child(3) > div:nth-child(2) > ul:nth-child(14) li a::text'
            ).extract()

        try:
            series_title = util.sanitize_title(series_title)

            series_title = re.sub(self.pattern_replace_name, '', series_title)

            first = True
            aliases = []

            new_search = series_title.split('/')
            for part in new_search:
                if first:
                    series_title = util.sanitize_title(part)
                    first = False
                else:
                    if part:
                        new_title = {}
                        new_title['title'] = util.sanitize_title(part)
                        new_title['language_id'] = self.dbase.language_en
                        aliases.append(new_title)

            #Format type. Get from series_title; types available: Franchise, Light Novel, Manga, Anime, Visual Novel, H-Game, OVA, ONA - Original Net Animation, Video Game, Movie, Drama CD
            type = util.sanitize_title(type)
            type_name = re.sub(self.pattern_parenthisis_right, '', type)
            type_name = re.sub(self.pattern_parenthisis_left, '', type_name)

            if "H-Game" in type_name:
                type_id = self.dbase.entity_type_erogame
            elif "OVA" in type_name:
                type_id = self.dbase.entity_type_ova
            elif "Movie" in type_name:
                type_id = self.dbase.entity_type_anime_movie
            else:
                type_id = self.dbase.add_type(type_name, 'entity')

            if update_id == None:
                #If it is a manga, light novel, manhwa or manhua, check if a matching entity alias already exists.
                if type_id == self.dbase.entity_type_manga or type_id == self.dbase.entity_type_lightnovel or type_id == self.dbase.entity_type_manhaw or type_id == self.dbase.entity_type_manhua:
                    new_name = []
                    if type_id == self.dbase.entity_type_lightnovel or type_id == self.dbase.entity_type_webnovel:
                        new_search_title = series_title + " (Novel)"
                    new_name.append(new_search_title)
                    update_id = self.dbase.get_var('entity_alias',
                                                   ['entity_id'], "name = %s",
                                                   new_name)

            #Format alias. Separate aliases by /
            english_title = util.sanitize_title(english_title)
            if english_title:
                new_nme = english_title.split('/')
                for new in new_nme:
                    new = util.sanitize_title(new)
                    if new:
                        new_title = {}
                        new_title['title'] = new
                        new_title['language_id'] = self.dbase.language_en
                        aliases.append(new_title)

            romanji_title = util.sanitize_title(romanji_title)
            if romanji_title:
                new_nme = romanji_title.split('/')
                for new in new_nme:
                    new = util.sanitize_title(new)
                    if new:
                        new_title = {}
                        new_title['title'] = new
                        new_title['language_id'] = self.dbase.language_ja
                        aliases.append(new_title)

            furigana_title = util.sanitize_title(furigana_title)
            if furigana_title:
                new_nme = furigana_title.split('/')
                for new in new_nme:
                    new = util.sanitize_title(new)
                    if new:
                        new_title = {}
                        new_title['title'] = new
                        new_title['language_id'] = self.dbase.language_ja
                        aliases.append(new_title)

            japanese_title = util.sanitize_title(japanese_title)
            if japanese_title:
                new_nme = japanese_title.split('/')
                for new in new_nme:
                    new = util.sanitize_title(new)
                    if new:
                        new_title = {}
                        new_title['title'] = new
                        new_title['language_id'] = self.dbase.language_ja
                        aliases.append(new_title)

            comments = []

            #Format table details
            #episodes = 0
            #ova_episodes = 0
            release_date, origin_entity_id, origin_type_id, origin_type = None, None, None, None
            genres_id = []
            aliases_company, companies, peoples, wikies = [], [], [], []
            found_origin = False
            classification_type_id = self.dbase.classification_type_12

            for item in table_details[5:]:
                new_item = util.sanitize_title(item.css('th::text').extract())
                if not new_item:
                    new_item = util.sanitize_title(
                        item.css('th a::text').extract())

                new_content_url_text = item.css('td a::text').extract()
                new_content_url = item.css('td a::attr(href)').extract()
                new_content_text = item.css('td::text').extract()

                if new_item and (new_content_url_text or new_content_text):
                    #print new_item

                    #Check release date
                    if new_item == "Release Date":
                        release_date = util.sanitize_title(
                            new_content_url_text)
                        if not release_date:
                            release_date = util.sanitize_title(
                                new_content_text)

                    #Check studios
                    elif "Studio Name" in new_item:
                        if new_item == "English Studio Name" or new_item == "Japanese Studio Name":
                            if new_item == "Japanese Studio Name":
                                language_company = self.dbase.language_ja
                            else:
                                language_company = self.dbase.language_en

                            for index, url in enumerate(new_content_url):
                                company_name = util.sanitize_title(
                                    new_content_url_text[index])
                                if company_name:
                                    new_alias = {}
                                    new_alias['url'] = re.sub(
                                        self.pattern_language, '',
                                        self.get_formatted_link(url))
                                    new_alias['name'] = company_name
                                    new_alias['language_id'] = language_company
                                    aliases_company.append(new_alias)

                    #Check publisher and developer.
                    elif re.search(self.pattern_companies, new_item) != None:
                        #Get company id from alias.
                        company_name = util.sanitize_title(
                            new_content_url_text)
                        if not company_name:
                            company_name = util.sanitize_title(
                                new_content_text)

                        if company_name:
                            #Get function type:
                            function_type = self.dbase.add_type(
                                new_item.title(), 'company_function')

                            new_company_name = company_name.split(',')
                            for company_name in new_company_name:
                                company_id = None
                                if company_name:

                                    where_values = []
                                    where_values.append(company_name)
                                    company_id = self.dbase.get_var(
                                        'company_alias', ['company_id'],
                                        "name = %s", where_values)
                                    #Get relation_type
                                    if not company_id:
                                        #create dummy.
                                        company_id = self.dbase.create_company(
                                            company_name,
                                            self.dbase.language_ja,
                                            self.dbase.country_jp)

                                    company = {}
                                    company['id'] = company_id
                                    company['function_type_id'] = function_type
                                    if not franchise:
                                        companies.append(company)
                                    else:
                                        companies.append(company_id)

                    #Check ratings
                    elif new_item == "Content Rating":
                        #Format rating
                        ratings = util.sanitize_title(new_content_text)
                        if not ratings:
                            ratings = util.sanitize_title(new_content_url_text)
                            if ratings:
                                if "Mature" in ratings:
                                    classification_type_id = self.dbase.classification_type_17
                                elif "Everyone" in ratings:
                                    classification_type_id = self.dbase.classification_type_free
                                elif "Child" in ratings:
                                    classification_type_id = self.dbase.classification_type_3
                                elif "10+" in ratings:
                                    classification_type_id = self.dbase.classification_type_10
                                elif "Teen" in ratings:
                                    classification_type_id = self.dbase.classification_type_13
                                elif "Adult" in ratings:
                                    classification_type_id = self.dbase.classification_type_18

                    #Check genre
                    elif new_item == "Genre Tags":
                        #Format genre
                        genres = new_content_url_text
                        for genre in genres:
                            new_genre = util.sanitize_title(genre)
                            if new_genre:
                                #Create genre
                                genre_id = self.dbase.add_name_to_table(
                                    new_genre.title(), 'genre')
                                #if genre_id: Dont need to check, if not save will raise a valueerror.
                                new_genre = {}
                                new_genre['id'] = genre_id
                                genres_id.append(new_genre)

                    #Check links
                    elif new_item == "Links":
                        #Format wikis
                        for index, link in enumerate(new_content_url):

                            if re.search(self.pattern_jp, link) != None:
                                link_language = self.dbase.language_ja
                            elif re.search(self.pattern_pt, link) != None:
                                link_language = self.dbase.language_pt
                            else:
                                link_language = self.dbase.language_en

                            wiki = {}
                            wiki['name'] = new_content_url_text[index]
                            wiki['url'] = link
                            wiki['language_id'] = link_language
                            wikies.append(wiki)

                    #Check episodes number (Episodes, OVA)
                    elif re.search(self.pattern_episodes, new_item) != None:
                        #check if OVA
                        #if re.search(ur'\b[OovVAa]{3}\b', new_item) != None:
                        #else:
                        content = util.sanitize_title(new_content_url_text)
                        if not content:
                            content = util.sanitize_title(new_content_text)

                        if content:
                            comment = {}
                            comment['title'] = 'Cralwer Episodes Number'
                            comment['content'] = content
                            comments.append(comment)

                    #Check origin
                    elif re.search(self.pattern_origin, new_item) != None:
                        #If Origin in manga
                        origin_type = util.sanitize_title(new_content_text)
                        if not origin_type:
                            origin_type = util.sanitize_title(
                                new_content_url_text)

                        if origin_type:
                            where_values = []
                            where_values.append(origin_type.title())
                            origin_type_id = self.dbase.get_var(
                                'entity_type', ['id'], "name = %s",
                                where_values)
                            if origin_type_id:
                                found_origin = True
                                where_values = []
                                where_values.append(origin_type_id)
                                where_values.append(series_title)
                                where_values.append(series_title)
                                origin_entity_id = self.dbase.get_var(
                                    'entity', ['entity.id'],
                                    "entity.entity_type_id = %s and (%s like '%%' || entity_alias.name || '%%' or entity_alias.name = %s)",
                                    where_values, ['entity_alias'],
                                    ["entity_alias.entity_id = entity.id"])

                        if not found_origin:
                            util.Log(response.url, "not found origin type",
                                     False)

                    #Check people (Director, Author, Artist, Writer, Composer, ADR Director, Character Design, Illustrator, Scenario)
                    elif re.search(self.pattern_people, new_item) != None:
                        #Get people id from alias.
                        people_name = util.sanitize_title(new_content_url_text)
                        if not people_name:
                            people_name = util.sanitize_title(new_content_text)

                        if people_name:
                            new_people_name = people_name.split(',')
                            for people_name in new_people_name:
                                people_id, alias_used_id = None, None
                                people_name = util.get_formatted_name(
                                    people_name, True)
                                if people_name:
                                    #Get relation type
                                    relation_type_id = self.dbase.add_type(
                                        new_item.title(), 'produces')

                                    where_values = []
                                    where_values.append(people_name['name'])
                                    where_values.append(
                                        people_name['lastname'])
                                    alias_used_id = self.dbase.get_var(
                                        'people_alias', ['id'],
                                        "name = %s and lastname = %s",
                                        where_values)
                                    #Get relation_type
                                    if not alias_used_id:
                                        #create dummy.
                                        people_id = self.dbase.create_people(
                                            people_name['name'],
                                            people_name['lastname'],
                                            self.dbase.country_jp)
                                        #Get alias.
                                        where_values = []
                                        where_values.append(people_id)
                                        alias_used_id = self.dbase.get_var(
                                            'people_alias', ['id'],
                                            "people_id = %s", where_values)
                                    else:
                                        #get people_id
                                        where_values = []
                                        where_values.append(alias_used_id)
                                        people_id = self.dbase.get_var(
                                            'people_alias', ['people_id'],
                                            "id = %s", where_values)

                                    people = {}
                                    people['id'] = people_id
                                    people['alias_used_id'] = alias_used_id
                                    people[
                                        'relation_type_id'] = relation_type_id
                                    peoples.append(
                                        people
                                    )  #There is no plural for multiple individual but, I don`t care.

                    #Check twitter
                    elif re.search(self.pattern_twitter, new_item) != None:
                        #Add twitter as comment.
                        comment = {}
                        comment['title'] = 'Cralwer Twitter new_item'
                        comment['content'] = new_content_url
                        comments.append(comment)

                    else:
                        content = util.sanitize_content(new_content_url_text)
                        if not content:
                            content = util.sanitize_content(new_content_text)
                        else:
                            second_content = util.sanitize_content(
                                new_content_text)
                            if second_content:
                                content = content + '\n' + second_content
                        if content:
                            #save comment
                            comment = {}
                            comment[
                                'title'] = 'Cralwer unknown new_item' + new_item
                            comment['content'] = content
                            comments.append(comment)

            #Format companies creator
            #Get company id from spider_item or alias
            for item in aliases_company:
                company_update_id = self.dbase.get_spider_item_id(
                    self.get_formatted_link(item['url']), 'company')
                if not company_update_id:
                    where_values = []
                    where_values.append(item['name'])
                    company_update_id = self.dbase.get_var(
                        'company_alias', ['company_id'], "name = %s",
                        where_values)

                alternate_names = []
                if not company_update_id:
                    company_current_alias = item['name']
                else:
                    company_current_alias = None
                    new_alias = {}
                    new_alias['name'] = item['name']
                    new_alias['language_id'] = item['language_id']
                    alternate_names.append(new_alias)

                #Create dummy. This method returns the company ID and adds new aliases if the company already exists. The new alias will be named as romanized type.
                company_id = self.dbase.create_company(
                    company_current_alias, item['language_id'],
                    self.dbase.country_jp, None, None, None, None, None, [],
                    [], [], [], [], [], [], alternate_names, company_update_id)
                #No need to check whether company_id is truthy, because an error is raised if it is not.
                new_company = {}
                new_company['id'] = company_id
                new_company[
                    'function_type_id'] = self.dbase.company_function_type_creator
                if not franchise:
                    companies.append(new_company)
                else:
                    companies.append(company_id)
                self.dbase.add_spider_item('company', company_id, item['url'],
                                           False)

            if not franchise:

                #Format related work
                relateds = []
                for url in related_url:
                    #Check if already registered, else create dummy without name.
                    new_url = self.get_formatted_link(url)
                    related_id = self.dbase.get_spider_item_id(
                        new_url, 'entity')
                    if not related_id:
                        #create dummy.
                        related_id = self.dbase.create_entity(
                            None, type_id, self.dbase.classification_type_12,
                            self.dbase.language_ja, self.dbase.country_jp)
                    self.dbase.add_spider_item('entity', related_id, new_url)
                    new_related = {}
                    new_related['id'] = related_id
                    new_related['type_id'] = self.dbase.based_type_sequel

                    relateds.append(new_related)

                #Format images
                new_images = []

                for image in images:
                    image_array = self.get_formatted_link(image).split('.')
                    new_image = {}
                    new_image['url'] = self.get_formatted_link(image)
                    new_image['extension'] = image_array.pop()
                    new_image_name = image_array.pop()
                    new_image_name = new_image_name.split('/')
                    new_image['name'] = new_image_name.pop()
                    new_images.append(new_image)

                for image in front_image:
                    image_array = self.get_formatted_link(image).split('.')
                    new_image = {}
                    new_image['url'] = self.get_formatted_link(image)
                    new_image['extension'] = image_array.pop()
                    new_image_name = image_array.pop()
                    new_image_name = new_image_name.split('/')
                    new_image['name'] = new_image_name.pop()
                    new_images.append(new_image)

                #Format collection
                collection_id = None
                collection_started = 'False'
                if "extends" in notice:
                    #Get collection from spider item
                    if notice_url:
                        new_url_collection = self.get_formatted_link(
                            notice_url[0])
                        collection_id = self.dbase.get_spider_item_id(
                            new_url_collection, 'new_url_collection')

                    if not collection_id and notice_name:
                        notice_name = util.sanitize_title(notice_name)
                        notice_name = re.sub(self.pattern_replace_name, '',
                                             notice_name)
                        where_values = []
                        where_values.append(notice_name)
                        collection_id = self.dbase.get_col(
                            'collection', 'id', "%s LIKE '%%' || name || '%%'",
                            where_values)

                        if not collection:
                            #create collection
                            collection_name = util.normalize_collection_name(
                                util.normalize_collection_name(notice_name))
                            collection_id = self.dbase.create_collection(
                                collection_name)
                            if new_url_collection:
                                add_spider_item('collection', collection_id,
                                                new_url_collection)
                        elif (isinstance(collection_id, collections.Iterable)
                              and not isinstance(collection_id,
                                                 types.StringTypes)):
                            #Return the element that appears most often in the list.
                            collections = []
                            for new_id in collection_id:
                                collections.append(new_id[0])
                            collection_id = util.most_common_oneliner(
                                collections)

                if not collection_id:
                    #Check if the name is similar to another collection already registered. Only check if the name is longer than 3 characters.
                    #This method can mismatch collection names, so collections will need to be checked after all items were crawled using get_related_item.
                    if (len(series_title) > 3):
                        series_name = []
                        series_name.append(series_title)
                        collection_id = self.dbase.get_col(
                            'collection', 'id', "%s LIKE '%%' || name || '%%'",
                            series_name)

                        if not collection_id:
                            #create new collection with the first name type, get firstname part using regex.
                            original_name = re.sub(self.pattern_replace_name,
                                                   '', series_title)

                            if not original_name:
                                original_name = series_title
                            original_name = util.normalize_collection_name(
                                original_name)
                            collection_id = self.dbase.create_collection(
                                original_name)
                        elif (isinstance(collection_id, collections.Iterable)
                              and not isinstance(collection_id,
                                                 types.StringTypes)):
                            #Return the element that appears most often in the list.
                            collections = []
                            for new_id in collection_id:
                                collections.append(new_id[0])
                            collection_id = util.most_common_oneliner(
                                collections)

                #Format language and country
                if type_id == self.dbase.entity_type_manhaw:
                    language_id = self.dbase.language_ko
                    country_id = self.dbase.country_kr
                elif type_id == self.dbase.entity_type_manhua:
                    language_id = self.dbase.language_zh
                    country_id = self.dbase.country_cn
                else:
                    #Format language
                    language_id = self.dbase.language_ja
                    #Format country
                    country_id = self.dbase.country_jp

                #Format classification
                if type_id == self.dbase.entity_type_erogame and classification_type_id != self.dbase.classification_type_18:
                    classification_type_id = self.dbase.classification_type_18

                #Format origin
                if not origin_entity_id and found_origin:
                    #Create dummy origin.
                    if aliases:
                        new_title = aliases[0]['title']
                        origin_entity_id = self.dbase.create_entity(
                            new_title, origin_type_id, classification_type_id,
                            language_id, country_id)

                #Format synopsis
                synopses = []
                if synopsis:
                    synopis_content = util.sanitize_content(synopsis)
                    if synopis_content:
                        synops = {}
                        synops['language_id'] = self.dbase.language_en
                        synops['content'] = synopis_content
                        synopses.append(synops)

            else:
                #Format name
                franchise_name = series_title.replace('(Franchise)', '')
                franchise_name = re.sub(self.pattern_replace_name, '',
                                        franchise_name)

                #Format description
                description = None
                if synopsis:
                    description = util.sanitize_content(synopsis)

                entities = []
                #Format associated entities
                for index, entity in enumerate(entities_text):
                    entity_name = util.sanitize_title(entities_text)
                    if entity_name:
                        #Get id from spider_item
                        entity_id = self.dbase.get_spider_item_id(
                            self.get_formatted_link(entities_url[index]),
                            'entity')
                        if not entity_id:
                            #Get id from alias.
                            where_values = []
                            where_values.append(entity_name)
                            entity_id = self.dbase.get_var(
                                'entity_alias', ['entity_id'], "name = %s",
                                where_values)
                        if not entity_id:
                            #Create dummy.
                            entity_id = self.dbase.create_entity(
                                entity_name, type_id,
                                self.dbase.classification_type_12,
                                self.dbase.language_ja, self.dbase.country_jp)
                            self.dbase.add_spider_item('entity', entity_id,
                                                       entities_url[index])
                        entities.append(entity_id)

        except ValueError as e:
            if not franchise:
                print "Error on formatting and getting IDs to save Series", e.message
            else:
                print "Error on formatting and getting IDs to save Franchise", e.message
            util.PrintException()
            util.Log(response.url, e.message)
            return
        except:
            if not franchise:
                print "Error on formatting Series", sys.exc_info()[0]
            else:
                print "Error on formatting Franchise", sys.exc_info()[0]
            util.PrintException()
            util.Log(response.url, sys.exc_info()[0])
            return

        try:
            self.dbase.set_auto_transaction(False)

            if franchise:
                collection_id = self.dbase.create_collection(
                    franchise_name, description, [], companies, aliases,
                    self.dbase.language_ja, update_id, entities)

                self.dbase.add_spider_item('collection', collection_id,
                                           response.url, True)
            else:
                entity_id = self.dbase.create_entity(
                    series_title, type_id, classification_type_id, language_id,
                    country_id, release_date, collection_id,
                    collection_started, aliases, [], synopses, wikies, [], [],
                    [], genres_id, [], companies, [], relateds, None,
                    new_images, update_id)

                if origin_entity_id:
                    self.dbase.add_relation_with_type(
                        'entity', 'entity', origin_entity_id, entity_id,
                        'based', self.dbase.based_type_adapted_from)

                for comment in comments:
                    self.dbase.add_comment(comment['title'],
                                           comment['content'], 1, entity_id,
                                           'entity')

                self.dbase.add_spider_item('entity', entity_id, response.url,
                                           True)

            self.dbase.commit()
            print "Success"

        except ValueError as e:
            self.dbase.rollback()
            print "Error on save Series", e.message
            util.PrintException()
            util.Log(response.url, e.message)
        except:
            self.dbase.rollback()
            print "Error on save Series", sys.exc_info()[0]
            util.PrintException()
            util.Log(response.url, sys.exc_info()[0])
        finally:
            self.dbase.set_auto_transaction(True)