Python GoogleDataCleanerの例、com.lish.pyutil.DataUtil.GoogleDataCleaner Pythonの例

コード例 #1

0

ファイルを表示

ファイル: extractor.py プロジェクト: yinonbaron/aminer-spider

    def pinMaxQuery(pubs):
        '''Query google scholar use: "xxx a" OR "xxx b" OR ..., max 256 chars. 
		return: query, used_pubs, nouse_pubs(write citation to -10 back to db)
		'''
        printout = False
        maxchars = 256
        query = ""
        total_pubs_used = 0
        used_pubs = []
        nouse_pubs = []
        total_title_length = 0
        for pub in pubs:
            # clean title
            cleaned_titles = GoogleDataCleaner.cleanGoogleTitle(pub.title)
            cleaned_title = cleaned_titles[0]

            # Add by gb Nov 05, 2011, filter out nouse titles.
            if cleaned_title is None or len(re.split('[\W+]',
                                                     cleaned_title)) < 3:
                print "**** no-use-title: ", cleaned_title
                #				pub.ncitation = -1;
                nouse_pubs.append(pub)
                continue

            # calc new length
            new_length = Extractor.__calc_control_char_length(
                total_pubs_used + 1) + total_title_length + len(cleaned_title)

            #
            splits = cleaned_title.split("\\s+")
            if splits is not None and len(splits) > 1:
                if total_pubs_used == 0:  # if the first one-word paper, only get this.
                    new_length += 255
                else:  # skip this one.
                    continue

            # first pub must be here, to avoid first pub title length > 255
            if total_pubs_used > 0 and new_length > maxchars:
                break  # overflow
            # real pin
            if total_pubs_used > 0:
                query += 'OR'
            query += ''.join(('"', cleaned_title, '"'))
            used_pubs.append(pub)
            total_pubs_used += 1
            total_title_length += len(cleaned_title)
        if printout:  # DEBUG PRINT
            blue_temple = "\033[34m%s\033[0m"
            print blue_temple % 'pin query done'
            q = ('query(%s): %s' % (len(query), query))
            print blue_temple % q
            t = 'use %s pubs' % total_pubs_used
            print blue_temple % t
        return query, used_pubs, nouse_pubs

コード例 #2

0

ファイルを表示

ファイル: extractor.py プロジェクト: AlexLyj/aminer-spider

	def pinMaxQuery(pubs):
		'''Query google scholar use: "xxx a" OR "xxx b" OR ..., max 256 chars. 
		return: query, used_pubs, nouse_pubs(write citation to -10 back to db)
		'''
		printout = False
		maxchars = 256
		query = ""
		total_pubs_used = 0
		used_pubs = []
		nouse_pubs = []
		total_title_length = 0
		for pub in pubs:
			# clean title
			cleaned_titles = GoogleDataCleaner.cleanGoogleTitle(pub.title)
			cleaned_title = cleaned_titles[0]
			
			# Add by gb Nov 05, 2011, filter out nouse titles.
			if cleaned_title is None or len(re.split('[\W+]', cleaned_title)) < 3:
				print "**** no-use-title: ", cleaned_title
#				pub.ncitation = -1;
				nouse_pubs.append(pub)
				continue
			
			
			# calc new length
			new_length = Extractor.__calc_control_char_length(total_pubs_used + 1) + total_title_length + len(cleaned_title)

			# 
			splits = cleaned_title.split("\\s+")
			if splits is not None and len(splits) > 1:
				if total_pubs_used == 0:  # if the first one-word paper, only get this.
					new_length += 255
				else:  # skip this one.
					continue

			# first pub must be here, to avoid first pub title length > 255
			if total_pubs_used > 0 and new_length > maxchars:
				break # overflow
			# real pin
			if total_pubs_used > 0:
				query += 'OR'
			query += ''.join(('"', cleaned_title, '"'))
			used_pubs.append(pub)
			total_pubs_used += 1
			total_title_length += len(cleaned_title)
		if printout:# DEBUG PRINT
			blue_temple = "\033[34m%s\033[0m"
			print blue_temple % 'pin query done'
			q = ('query(%s): %s' % (len(query), query))
			print blue_temple % q
			t = 'use %s pubs' % total_pubs_used
			print blue_temple % t
		return query, used_pubs, nouse_pubs

コード例 #3

0

ファイルを表示

    def matchPub(self,
                 pubs,
                 extracted_map,
                 check_person=False,
                 debug_output=False):
        '''Match pub with extracted
		@return (pubs_matched, pubs_not_matched)
		@params:
			pubs - Publication read from database.
			extracted_map - same with all_models {key_title:[ExtractedModel,...]}
			check_person - if True, will check if authors is matched with authors in db.(will ignore ...).
				default False. Search using author:xxx do not need author check, this work is done by google.
		'''
        if pubs is None or len(pubs) == 0: return [], pubs
        if extracted_map is None or len(extracted_map) == 0: return [], pubs
        if self.debug and False:
            print 'match %s pubs in %s extracted items' % (len(pubs),
                                                           len(extracted_map))

        # match
        print_not_matched = False
        pubs_matched = []
        pubs_not_matched = []

        for pub in pubs:
            cleanned_tuple = GoogleDataCleaner.cleanGoogleTitle(pub.title)
            key_title = cleanned_tuple[1]
            has_dot = cleanned_tuple[2]

            # find models list
            models = []

            if key_title in extracted_map:  # title is full, no ignore
                models = extracted_map[key_title]
            else:  # title in results has ..., ignored.
                if has_dot:
                    for short_key, extracted_models in extracted_map.items():
                        if key_title.find(short_key) != -1:
                            models.extend(extracted_models)

            # exact match
            if models is not None and len(models) > 0:
                max_citation_model = None
                debug_all_author_string = []
                for model in models:
                    debug_all_author_string.append(model.authors)
                    if model.authors is None or \
                    self.matchAuthors(model.authors, pub.authors, debug_output=False, debug_title=model.title):  # if author matched.
                        if max_citation_model is None or int(
                                max_citation_model.ncitation) < int(
                                    model.ncitation):
                            max_citation_model = model
                # select max citation?
                if max_citation_model is not None:
                    # allow 10% discount
                    if max_citation_model.ncitation >= pub.ncitation:
                        pub.ncitation = max_citation_model.ncitation
                        pub.increased = max_citation_model.ncitation - pub.ncitation
                    pubs_matched.append(pub)
                else:
                    # citation model not found.
                    if debug_output:
                        print "[DEBUG] PubMatcher.matchPub: Author not match. ", \
                         "\n\tTitle:%s \n\tRequired:%s \n\tGot(last):%s" % \
                         (pub.title, pub.authors, "\n".join(debug_all_author_string))

        # print not matched?
        for pub in pubs:
            title = pub.title
            found = False
            for matched in pubs_matched:
                if title == matched.title:
                    found = True
                    break
            if not found:
                pubs_not_matched.append(pub)
                if print_not_matched: print 'this pub not matched: ', pub

        return (pubs_matched, pubs_not_matched)

コード例 #4

0

ファイルを表示

ファイル: pubmatcher_v1_a_little_strict.py プロジェクト: AlexLyj/aminer-spider

	def matchPub(self, pubs, extracted_map, check_person=False, debug_output=False):
		'''Match pub with extracted
		@return (pubs_matched, pubs_not_matched)
		@params:
			pubs - Publication read from database.
			extracted_map - same with all_models {key_title:[ExtractedModel,...]}
			check_person - if True, will check if authors is matched with authors in db.(will ignore ...).
				default False. Search using author:xxx do not need author check, this work is done by google.
		'''
		if pubs is None or len(pubs) == 0: return [], pubs
		if extracted_map is None or len(extracted_map) == 0: return [], pubs
		if self.debug and False: print 'match %s pubs in %s extracted items' % (len(pubs), len(extracted_map))

		# match
		print_not_matched = False
		pubs_matched = []
		pubs_not_matched = []

		for pub in pubs:
			cleanned_tuple = GoogleDataCleaner.cleanGoogleTitle(pub.title)
			key_title = cleanned_tuple[1]
			has_dot = cleanned_tuple[2]
			
			# find models list
			models = []
			
			if key_title in extracted_map:  # title is full, no ignore
				models = extracted_map[key_title]
			else:  # title in results has ..., ignored.
				if has_dot:
					for short_key, extracted_models in extracted_map.items():
						if key_title.find(short_key) != -1:
							models.extend(extracted_models)

			# exact match 
			if models is not None and len(models) > 0:
				max_citation_model = None
				debug_all_author_string = []
				for model in models:
					debug_all_author_string.append(model.authors)
					if model.authors is None or \
					self.matchAuthors(model.authors, pub.authors, debug_output=False, debug_title=model.title):  # if author matched.
						if max_citation_model is None or int(max_citation_model.ncitation) < int(model.ncitation):
							max_citation_model = model
				# select max citation?
				if max_citation_model is not None:
					# allow 10% discount
					if  max_citation_model.ncitation >= pub.ncitation: 
						pub.ncitation = max_citation_model.ncitation
						pub.increased = max_citation_model.ncitation - pub.ncitation
					pubs_matched.append(pub)
				else:
					# citation model not found.
					if debug_output:
						print "[DEBUG] PubMatcher.matchPub: Author not match. ", \
							"\n\tTitle:%s \n\tRequired:%s \n\tGot(last):%s" % \
							(pub.title, pub.authors, "\n".join(debug_all_author_string))

		# print not matched?
		for pub in pubs:
			title = pub.title
			found = False;
			for matched in pubs_matched:
				if title == matched.title:
					found = True
					break
			if not found:
				pubs_not_matched.append(pub)
				if print_not_matched: print 'this pub not matched: ', pub

		return (pubs_matched, pubs_not_matched)

コード例 #5

0

ファイルを表示

ファイル: pubmatcher.py プロジェクト: yinonbaron/aminer-spider

    def matchPub(self,
                 pubs,
                 extracted_map,
                 check_person=False,
                 debug_output=False):
        '''Match pub with extracted
		@return (pubs_matched, pubs_not_matched)
		@params:
			pubs - Publication read from database.
			extracted_map - same with all_models {key_title:[ExtractedModel,...]}
			check_person - if True, will check if authors is matched with authors in db.(will ignore ...).
				default False. Search using author:xxx do not need author check, this work is done by google.
		'''
        if pubs is None or len(pubs) == 0:
            return [], pubs
        if extracted_map is None or len(extracted_map) == 0:
            return [], pubs
        if self.debug and False:
            print 'match %s pubs in %s extracted items' % (len(pubs),
                                                           len(extracted_map))

        # match
        print_not_matched = False
        pubs_matched = []
        pubs_not_matched = []

        for pub in pubs:
            cleanned_tuple = GoogleDataCleaner.cleanGoogleTitle(pub.title)
            key_title = cleanned_tuple[1]
            has_dot = cleanned_tuple[2]

            # First Match, select loose match. ExtractedPub List
            models = []

            # different with v1, a looser match.
            # title allow 10% mistake.
            # author loose match.
            for short_key, extracted_models in extracted_map.items():
                matched = False
                if has_dot:
                    # ignore and contains
                    if key_title.find(short_key) != -1:
                        matched = True
                        _m = extracted_models
                        for m in _m:  # Add Loose Value
                            m.looseValue += 1
                        models.extend(_m)
                else:
                    # direct match
                    if key_title == short_key:
                        matched = True
                        _m = extracted_map[key_title]
                        for m in _m:  # Add Loose Value
                            m.looseValue += 0
                        models.extend(_m)

                # try loose match
                if not matched:
                    ed = editdist.distance(short_key, key_title)
                    if ed < 10:
                        looseValue = float(len(key_title)) * (10 / float(100))
                        if looseValue > ed:  # remove ed not match much
                            _m = extracted_models
                            for m in _m:
                                m.looseValue += ed
                            models.extend(_m)
#							if True and ed < 10:
#								print '-' * 100
#								print 'title: %s ' % key_title
#								print 'short: %s ' % short_key
#								print 'ed is: %s ' % ed
#								print 'loose: %s ' % looseValue

# Exact match, select who is the right one.
            if models is not None and len(models) > 0:
                max_citation_model = None
                for model in models:
                    if model.authors is None or \
                    self.matchAuthors(model.authors, pub.authors, debug_output=False, debug_title=model.title):  # if author matched.
                        if max_citation_model is None or int(
                                max_citation_model.ncitation) < int(
                                    model.ncitation):
                            max_citation_model = model
                # select max citation?
                if max_citation_model is not None:
                    if max_citation_model.ncitation >= pub.ncitation:
                        pub.ncitation = max_citation_model.ncitation
                        pub.increased = max_citation_model.ncitation - pub.ncitation

                    pub.pdflink = max_citation_model.pdfLink
                    pub.web_url = max_citation_model.web_url
                    pubs_matched.append(pub)
                    if pub.pdflink is None:
                        file_object = open('web_url.txt', 'a')
                        web_url = pub.web_url
                        Id = str(pub.id)
                        Title = pub.title
                        Author = str(pub.authors)
                        file_object.write(" ".join(
                            [Id, Title, Author, web_url]))
                        file_object.write("\n")
                        file_object.close()
                    else:
                        file_object = open('paper_link.txt', 'a')
                        Id = str(pub.id)
                        Title = pub.title

                        Pdflink = str(pub.pdflink)
                        Author = str(pub.authors)
                        file_object.write(" ".join(
                            [Id, Title, Author, Pdflink]))
                        file_object.write("\n")
                        file_object.close()

        for pub in pubs:
            title = pub.title
            found = False
            for matched in pubs_matched:
                if title == matched.title:
                    found = True
                    break
            if not found:
                pubs_not_matched.append(pub)
                if print_not_matched:
                    print 'this pub not matched: ', pub

        return (pubs_matched, pubs_not_matched)

コード例 #6

0

ファイルを表示

ファイル: pubmatcher.py プロジェクト: AlexLyj/aminer-spider

	def matchPub(self, pubs, extracted_map, check_person=False, debug_output=False):
		'''Match pub with extracted
		@return (pubs_matched, pubs_not_matched)
		@params:
			pubs - Publication read from database.
			extracted_map - same with all_models {key_title:[ExtractedModel,...]}
			check_person - if True, will check if authors is matched with authors in db.(will ignore ...).
				default False. Search using author:xxx do not need author check, this work is done by google.
		'''
		if pubs is None or len(pubs) == 0: 
			return [], pubs
		if extracted_map is None or len(extracted_map) == 0: 
			return [], pubs
		if self.debug and False: 
			print 'match %s pubs in %s extracted items' % (len(pubs), len(extracted_map))
			
		# match
		print_not_matched = False
		pubs_matched = []
		pubs_not_matched = []

		for pub in pubs:
			cleanned_tuple = GoogleDataCleaner.cleanGoogleTitle(pub.title)
			key_title = cleanned_tuple[1]
			has_dot = cleanned_tuple[2]
			
			# First Match, select loose match. ExtractedPub List
			models = []
			
			# different with v1, a looser match.
			# title allow 10% mistake.
			# author loose match.
			for short_key, extracted_models in extracted_map.items():
				matched = False
				if has_dot:
					# ignore and contains
					if key_title.find(short_key) != -1:
						matched = True
						_m = extracted_models
						for m in _m: # Add Loose Value
							m.looseValue += 1;
						models.extend(_m)
				else:
					# direct match
					if key_title == short_key:
						matched = True
						_m = extracted_map[key_title]
						for m in _m: # Add Loose Value
							m.looseValue += 0;
						models.extend(_m);
			
				# try loose match
				if not matched:
					ed = editdist.distance(short_key, key_title)
					if ed < 10:
						looseValue = float(len(key_title)) * (10 / float(100))
						if looseValue > ed: # remove ed not match much
							_m = extracted_models
							for m in _m:
								m.looseValue += ed;
							models.extend(_m)
#							if True and ed < 10:
#								print '-' * 100
#								print 'title: %s ' % key_title
#								print 'short: %s ' % short_key
#								print 'ed is: %s ' % ed
#								print 'loose: %s ' % looseValue
					
			# Exact match, select who is the right one.
			if models is not None and len(models) > 0:
				max_citation_model = None
				for model in models:
					if model.authors is None or \
					self.matchAuthors(model.authors, pub.authors, debug_output=False, debug_title=model.title):  # if author matched.
						if max_citation_model is None or int(max_citation_model.ncitation) < int(model.ncitation):
							max_citation_model = model
				# select max citation?
				if max_citation_model is not None:
					if  max_citation_model.ncitation >= pub.ncitation: 
						pub.ncitation = max_citation_model.ncitation
						pub.increased = max_citation_model.ncitation - pub.ncitation

					pub.pdflink = max_citation_model.pdfLink
					pub.web_url = max_citation_model.web_url
					pubs_matched.append(pub)
					if pub.pdflink is None:
						file_object = open('web_url.txt', 'a')
						web_url= pub.web_url
						Id = str(pub.id)
						Title = pub.title
						Author = str(pub.authors)
						file_object.write(" ".join([Id, Title, Author, web_url]))
						file_object.write("\n")
						file_object.close()
					else: 
						file_object = open('paper_link.txt', 'a')
						Id = str(pub.id)
						Title = pub.title
								
						Pdflink = str(pub.pdflink)
						Author = str(pub.authors)
						file_object.write(" ".join([Id, Title, Author, Pdflink]))
						file_object.write("\n")
						file_object.close()

		for pub in pubs:
			title = pub.title
			found = False;
			for matched in pubs_matched:
				if title == matched.title:
					found = True
					break
			if not found:
				pubs_not_matched.append(pub)
				if print_not_matched: 
					print 'this pub not matched: ', pub

		return (pubs_matched, pubs_not_matched)