Example 1
def source_attribute(req, selectionId, sourceId, attribute, attributeId):
    # 404 early if the referenced selection or source does not exist.
    get_object_or_404(Selection, pk=selectionId)
    get_object_or_404(Source, pk=sourceId)
    if attribute in source_tag_types:
        return tag(req, attributeId)
    elif attribute == 'author':
        return author(req, attributeId)
    else:
        raise Http404
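
A hedged wiring sketch for the view above; the route shape, module layout, and pattern name are assumptions, not taken from the project:

from django.urls import path
from . import views  # assumed module that defines source_attribute

urlpatterns = [
    # Hypothetical route; the converters mirror the view's parameters.
    path('selections/<int:selectionId>/sources/<int:sourceId>/<str:attribute>/<int:attributeId>/',
         views.source_attribute, name='source-attribute'),
]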
Example 2

def read_documents(author_dir):
	document_list = []
	auth = author(os.path.basename(author_dir))	# Construct an 'author' object named after the directory.
	for doc_name in os.listdir(author_dir):
		if doc_name.endswith('.txt'):
			with open(os.path.join(author_dir, doc_name), 'r') as doc_file:	# close the file after reading
				doc_text = doc_file.read()
			# Construct a document from the file and author, and collect it.
			document_list.append(document(auth.name + doc_name[:-4], auth.name, doc_text))
	auth.doc_list = document_list
	auth.doc_count = len(document_list)
	return auth	# Return the author object with its documents attached.
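
A possible usage sketch for read_documents, assuming a corpus laid out as one directory per author (the corpus_root path is illustrative):

import os

corpus_root = 'corpus'  # hypothetical layout: corpus/<author_name>/*.txt
authors = [read_documents(os.path.join(corpus_root, name))
           for name in sorted(os.listdir(corpus_root))
           if os.path.isdir(os.path.join(corpus_root, name))]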
Example 3
def addBook():
    error = None
    if request.method == 'POST':
        global b
        global writer
        b = book.book(request.form['ISBN'], request.form['title'],
                      request.form['yearPublished'])
        writer = author.author(request.form['firstName'],
                               request.form['lastName'],
                               request.form['country'])
        if not db.bookExist(b):
            try:
                db.addBook(b)
            except Exception:
                error = 'Invalid Data. Please try again.'
            if not db.authorExist(writer):
                try:
                    db.addAuthor(writer)
                except Exception:
                    error = 'Invalid Data. Please try again.'
        try:
            db.authorId(writer)
        except Exception:
            error = 'Author Not Correct For Book. Please try again.'
        if not db.readsExist(person, b):  # 'person' is assumed to be set elsewhere (e.g., the logged-in user)
            try:
                db.addReads(person, b, request.form['rate'])
            except Exception:
                error = 'Error. Please try again.'
        if not db.writesExist(writer, b):
            try:
                db.addWrites(b, writer)
            except Exception:
                error = 'Error HERE. Please try again.'

    return render_template('addBook.html', error=error)
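
The five near-identical try/except blocks above could be collapsed with a small helper. A minimal sketch, assuming only that the db helpers raise on failure (the helper name and the breadth of the except clause are illustrative, not part of the project):

def run_db_call(action, failure_message):
    # Run one db call; return None on success, an error string on failure.
    # Narrow Exception down to the driver's error class if it is known.
    try:
        action()
        return None
    except Exception:
        return failure_message

# e.g.: error = run_db_call(lambda: db.addBook(b), 'Invalid Data. Please try again.')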
Example 4
    def do_author(self):
        from author import author

        return author()
Example 5
 def do_atom_contributor(self):
     from author import author
     return author()
Example 6
 def do_atom_author(self):
     from author import author
     return author(), noduplicates()
Example 7
 def do_contributor(self):
   self.metadata()
   from author import author
   return author()
Example 8
 def do_author(self):
   self.metadata()
   from author import author
   return author()
Example 9
 def do_atom_author(self):
     from author import author
     return author()
Example 10
	def parse(self,xml_string,input_file_name,curs):
		# lxml parser: validate against the DTD, drop comments, and recover from minor errors
		parser_dtd=etree.XMLParser(encoding='ISO-8859-1',dtd_validation=True,load_dtd=True,remove_comments=True,recover=True)
		root = etree.fromstring(xml_string.encode('ISO-8859-1'),parser_dtd)
		for REC in root:
			
			#Parse publication and create a publication object containing all the attributes of publication
			new_pub=pub.publication()

			author_names=[]
		
			new_pub.source_type=REC.tag
			
			new_pub.source_id=REC.attrib.get('key')
			
			if 'mdate' in REC.attrib:
				new_pub.modified_date=REC.attrib.get('mdate')

			if 'publtype' in REC.attrib:
				new_pub.document_type=REC.attrib.get('publtype')

			#Can be more than one author; findall returns a (possibly empty) list, never None
			author_fields=REC.findall('author')
			for auth in author_fields:
				if 'orcid' in auth.attrib:
					author_names.append((auth.text,auth.attrib.get('orcid')))
				else:
					author_names.append((auth.text,None))
				
			pages=REC.find('pages')
			if pages is not None:
				page_parts=pages.text.split('-')
				new_pub.begin_page=page_parts[0]
				if len(page_parts) == 2:
					new_pub.end_page=page_parts[1]

			title=REC.find('title')
			if title is not None:
				new_pub.document_title=title.text

			issue_no=REC.find('number')
			if issue_no is not None:
				new_pub.issue=issue_no.text

			year=REC.find('year')
			if year is not None:
				new_pub.publication_year=year.text

			address=REC.find('address')
			if address is not None:
				new_pub.publisher_address=address.text

			publisher=REC.find('publisher')
			if publisher is not None:
				new_pub.publisher_name=publisher.text

			vol=REC.find('volume')
			if vol is not None:
				new_pub.volume=vol.text

			s_title=REC.find('journal')
			if s_title is not None:
				new_pub.source_title=s_title.text
			else:
				s_title=REC.find('booktitle')
				if s_title is not None:
					new_pub.source_title=s_title.text

			#Query to insert publication record into the publications table in the database
			curs.execute("INSERT INTO dblp_publications (begin_page,modified_date,document_title,document_type,end_page,issue,"\
				"publication_year,publisher_address,publisher_name,source_id,source_title,source_type,volume)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"\
				" ON CONFLICT (source_id) DO UPDATE SET begin_page=excluded.begin_page,modified_date=excluded.modified_date,document_title=excluded.document_title,"\
				"document_type=excluded.document_type,end_page=excluded.end_page,issue=excluded.issue,publication_year=excluded.publication_year,"\
				"publisher_address=excluded.publisher_address,publisher_name=excluded.publisher_name,source_id=excluded.source_id,source_title=excluded.source_title,"\
				"source_type=excluded.source_type,volume=excluded.volume,last_updated_time=current_timestamp;",
				(str(new_pub.begin_page),new_pub.modified_date,str(new_pub.document_title),str(new_pub.document_type),str(new_pub.end_page),str(new_pub.issue),
					str(new_pub.publication_year),str(new_pub.publisher_address),str(new_pub.publisher_name),str(new_pub.source_id),str(new_pub.source_title),str(new_pub.source_type),
					str(new_pub.volume)))


			#parse document identifier fields for each publication
			new_doc=doc.document()

			#A dictionary which stores all the document id's and types
			docs=dict()


			new_doc.source_id=new_pub.source_id
			ee=REC.findall('ee')	# findall returns a list; no None check needed
			for i in ee:
				docs[i.text]=i.tag

			url=REC.findall('url')
			for i in url:
				docs[i.text]=i.tag

			crossref=REC.findall('crossref')
			for i in crossref:
				docs[i.text]=i.tag

			isbn=REC.find('isbn')
			if isbn is not None:
				docs[isbn.text]=isbn.tag
				
			series=REC.find('series')
			if series is not None:
				docs[series.text]=series.tag
				
			cdrom=REC.find('cdrom')
			if cdrom is not None:
				docs[cdrom.text]=cdrom.tag

			school=REC.find('school')
			if school is not None:
				docs[school.text]=school.tag

			notes=REC.find('notes')
			if notes is not None:
				docs[notes.text]=notes.tag

			#Inserting records into dblp_document_identifiers
			for text,tag in docs.items():
				new_doc.document_id=text
				new_doc.document_id_type=tag

				curs.execute("INSERT INTO dblp_document_identifiers(source_id,document_id,document_id_type) VALUES(%s,%s,%s)"\
					"ON CONFLICT (source_id,document_id,document_id_type) DO UPDATE SET source_id=excluded.source_id,"\
					"document_id=excluded.document_id,document_id_type=excluded.document_id_type,last_updated_time=current_timestamp;",
					(str(new_doc.source_id),str(new_doc.document_id),str(new_doc.document_id_type)))

			#parse author fields for dblp_authors
			new_auth=author.author()

			editor=REC.find('editor')
			if editor is not None:
				new_auth.editor_name=editor.text

			seq_no=0	

			for name in author_names:
				new_auth.first_name=' '.join(name[0].split()[:-1])
				new_auth.last_name=name[0].split()[-1]
				new_auth.full_name=name[0]
				new_auth.source_id=new_pub.source_id
				new_auth.seq_no=seq_no
				new_auth.orc_id=name[1]	# may be None; reset on every iteration so a previous author's ORCID does not carry over


				curs.execute("INSERT INTO dblp_authors(source_id,full_name,last_name,first_name,seq_no,orc_id,editor_name)"\
					"VALUES(%s,%s,%s,%s,%s,%s,%s) ON CONFLICT (source_id,seq_no) DO UPDATE SET source_id=excluded.source_id,"\
					"full_name=excluded.full_name,last_name=excluded.last_name,first_name=excluded.first_name,seq_no=excluded.seq_no,"\
					"orc_id=excluded.orc_id,editor_name=excluded.editor_name,last_updated_time=current_timestamp;",(str(new_auth.source_id),str(new_auth.full_name),str(new_auth.last_name),
						str(new_auth.first_name),str(new_auth.seq_no),str(new_auth.orc_id),str(new_auth.editor_name)))

				seq_no+=1

			#parse citation fields for dblp_references

			new_ref=reference.reference()
			new_ref.source_id=new_pub.source_id

			citations=REC.findall('cite')
			for cite in citations:
				if cite.text != '...':
					new_ref.cited_source_id=cite.text

					curs.execute("INSERT INTO dblp_references(source_id,cited_source_id) VALUES(%s,%s) ON CONFLICT ON CONSTRAINT"\
						" dblp_references_pk DO UPDATE SET source_id=excluded.source_id,cited_source_id=excluded.cited_source_id,"\
						"last_updated_time=current_timestamp;",(str(new_ref.source_id),str(new_ref.cited_source_id)))
Example 11
 def do_author(self):
     self.metadata()
     from author import author
     return author()
Example 12
    def parse(self, xml_string, input_file_name, curs):
        url = '<xml header URL>'
        root = ET.fromstring(xml_string)
        for REC in root:
            # parse publications and create a publication object containing all the attributes of publication
            new_pub = pub.publication()


            new_pub.source_id = REC.find(url + 'UID').text

            pub_info = REC.find('.//' + url + 'pub_info')
            new_pub.source_type = pub_info.get('pubtype')

            source_title = REC.find('.//' + url + "title[@type='source']")

            if source_title is not None:
                if source_title.text is not None:
                    new_pub.source_title = source_title.text.encode('utf-8')
            # extracting values from properties of pub_info tag in XML
            new_pub.has_abstract = pub_info.get('has_abstract')
            new_pub.publication_year = pub_info.get('pubyear')
            new_pub.issue = pub_info.get('issue')
            new_pub.volume = pub_info.get('vol')
            new_pub.pubmonth = pub_info.get('pubmonth')
            new_pub.publication_date = pub_info.get('sortdate')
            new_pub.coverdate = pub_info.get('coverdate')

            page_info = pub_info.find(url + 'page')
            if page_info is not None:
                new_pub.begin_page = page_info.get('begin')
                new_pub.end_page = page_info.get('end')

            document_title = REC.find('.//' + url + "title[@type='item']")
            if document_title is not None:
                if document_title.text is not None:
                    new_pub.document_title = document_title.text. \
                        encode('utf-8')

            document_type = REC.find('.//' + url + 'doctype')
            if document_type is not None:
                if document_type.text is not None:
                    new_pub.document_type = document_type.text

            publisher_name = REC.find('.//' + url + "name[@role='publisher']")
            if publisher_name is not None:
                pub_name = publisher_name.find('.//' + url + 'full_name')
                if pub_name is not None:
                    if pub_name.text is not None:
                        new_pub.publisher_name = pub_name.text. \
                            encode('utf-8')

            pub_address_no = REC.find('.//' + url + "address_spec[@addr_no='1']")
            if pub_address_no is not None:
                publisher_address = pub_address_no.find('.//' + url + 'full_address')
                if publisher_address is not None:
                    if publisher_address.text is not None:
                        new_pub.publisher_address = publisher_address.text. \
                            encode('utf-8')

            languages = REC.find('.//' + url + 'languages')
            if languages is not None:
                language = languages.find('.//' + url + 'language')
                if language is not None:
                    if language.text is not None:
                        new_pub.language = language.text.encode('utf-8')

            new_pub.edition = REC.find('.//' + url + 'edition').get('value')
            new_pub.source_filename = input_file_name
            new_pub.created_date = datetime.date.today()
            new_pub.last_modified_date = datetime.date.today()
            ## query to insert a publication record into the publications table in the database
            ## The query may be moved into a separate file in the future, from where it would be read as a string, amended with values, and executed, to make the code cleaner
            # TODO Query below is hard to read. I'd try a multi-line string with the proper SQL formatting.
            curs.execute(
                '<query to upsert data in database>')

            # parse grants in funding acknowledgements for each publication
            # New approach: store everything in proper objects, which could later grow into classes with their own properties
            r_grant = grant.grant()
            r_grant.source_id = new_pub.source_id

            # r_grant.funding_ack = ''
            FUNDING_ACK = REC.find('.//' + url + 'fund_text')

            if FUNDING_ACK is not None:  # if funding acknowledgement exists, then extract the r_grant(s) data
                funding_ack_p = FUNDING_ACK.find('.//' + url + 'p')
                if funding_ack_p is not None:
                    if funding_ack_p.text is not None:
                        r_grant.funding_ack = funding_ack_p.text.encode('utf-8')
            # looping through all the grant tags
            for l_grant in REC.findall('.//' + url + 'grant'):
                # r_grant.grant_agency = ''
                grant_agency = l_grant.find('.//' + url + 'grant_agency')
                if grant_agency is not None:
                    if grant_agency.text is not None:
                        r_grant.grant_agency = grant_agency.text.encode('utf-8')

                grant_ids = l_grant.find('.//' + url + 'grant_ids')
                if grant_ids is not None:
                    for grant_id in grant_ids.findall('.//' + url + 'grant_id'):
                        if grant_id is not None:
                            if grant_id.text is not None:
                                r_grant.grant_number = grant_id.text.encode('utf-8')
                        if r_grant.funding_ack is not None:
                            # insert the grant details in the grants table if there is any funding acknowledgement in the records
                            curs.execute(
                                '<query to upsert data in database>')


            # insert code to insert record in r_grant table
            # parse document object identifiers for each publication
            r_dois = dois.dois()
            r_dois.source_id = new_pub.source_id

            IDS = REC.find('.//' + url + 'identifiers')
            if IDS is not None:
                for identifier in IDS.findall('.//' + url + 'identifier'):
                    id_value = identifier.get('value')
                    if id_value is not None:
                        r_dois.doi = id_value.encode('utf-8')
                    id_type = identifier.get('type')
                    if id_type is not None:
                        r_dois.doi_type = id_type.encode('utf-8')
                    if r_dois.doi is not None:
                        # inserting records into the wos_document_identifier table
                        curs.execute(
                            '<query to upsert data in database>')


            # parse keyword for each publication
            keywords = REC.find('.//' + url + 'keywords_plus')
            if keywords is not None:
                r_keyword = key_word.wos_keyword()
                r_keyword.source_id = new_pub.source_id
                for keyword in keywords.findall('.//' + url + 'keyword'):
                    if keyword is not None:
                        if keyword.text is not None:
                            r_keyword.keyword = keyword.text.encode('utf-8')
                            # inserting records in wos_keywords
                            curs.execute(
                                '<query to upsert data in database>')


            # parse abstract for each publication
            if new_pub.has_abstract == 'Y':
                abstracts = REC.find('.//' + url + 'abstracts')
                if abstracts is not None:
                    r_abst = abst.abstract()
                    r_abst.source_id = new_pub.source_id
                    r_abstract_text = ''
                    for abstract_text in abstracts.findall('.//' + url + 'p'):
                        if abstract_text is not None:
                            if abstract_text.text is not None:
                                if r_abstract_text != '' and abstract_text.text != '':
                                    r_abstract_text += '\n\n'  # blank line between paragraphs
                                r_abstract_text = r_abstract_text + abstract_text.text.encode('utf-8')
                    # joining all the abstract paragraphs into one before writing it into the database
                    r_abst.abstract_text = re.sub(r"^[\n]+", "", r_abstract_text)
                    # writing the abstracts record into the data base
                    curs.execute(
                        '<query to upsert data in database>')



            # parse addresses for each publication

            r_addr = add.address()
            addr_no_list = []
            addresses = REC.find('.//' + url + 'addresses')
            for addr in addresses.findall('.//' + url + 'address_spec'):

                addr_ind = addr.get('addr_no')
                if addr_ind is None:
                    addr_ind = 0
                else:
                    addr_ind = int(addr_ind)
                    # Keep all addr_no values for the later reference by authors
                    addr_no_list.append(int(addr_ind))

                r_addr.source_id[addr_ind] = new_pub.source_id
                r_addr.addr_name[addr_ind] = ''
                addr_name = addr.find('.//' + url + 'full_address')
                if addr_name is not None:
                    if addr_name.text is not None:
                        r_addr.addr_name[addr_ind] = addr_name.text.encode('utf-8')
                r_addr.organization[addr_ind] = ''
                organization = addr.find('.//' + url + "organization[@pref='Y']")
                if organization is not None:
                    if organization.text is not None:
                        r_addr.organization[addr_ind] = organization.text. \
                            encode('utf-8')
                r_addr.sub_organization[addr_ind] = ''
                suborganization = addr.find('.//' + url + 'suborganization')
                if suborganization is not None:
                    if suborganization.text is not None:
                        r_addr.sub_organization[addr_ind] = suborganization.text. \
                            encode('utf-8')
                r_addr.city[addr_ind] = ''
                city = addr.find('.//' + url + 'city')
                if city is not None:
                    if city.text is not None:
                        r_addr.city[addr_ind] = city.text.encode('utf-8')
                r_addr.country[addr_ind] = ''
                country = addr.find('.//' + url + 'country')
                if country is not None:
                    if country.text is not None:
                        r_addr.country[addr_ind] = country.text.encode('utf-8')
                r_addr.zip_code[addr_ind] = ''
                addr_zip = addr.find('.//' + url + 'zip')
                if addr_zip is not None:
                    if addr_zip.text is not None:
                        r_addr.zip_code[addr_ind] = addr_zip.text.encode('utf-8')
                if r_addr.addr_name[addr_ind] is not None:
                    # Inserting address records into the database, then retrieving and storing the address_id for later use when inserting authors
                    curs.execute(
                        '<query to upsert data in database>')
                    r_addr.id[addr_ind] = curs.fetchone()[0]


            # parse titles for each publication
            r_title = ti.title()
            r_title.source_id = new_pub.source_id

            summary = REC.find('.//' + url + 'summary')
            if summary is not None:
                titles = summary.find('.//' + url + 'titles')
                if titles is not None:
                    for title in titles.findall('.//' + url + 'title'):
                        if title is not None:
                            if title.text is not None:
                                r_title.title = title.text.encode('utf-8')
                                r_title.type = title.get('type')
                                # inserting titles into the database
                                curs.execute(
                                    '<query to upsert data in database>')


            # parse authors for each publication
            r_author = auth.author()
            r_author.source_id = new_pub.source_id

            summary = REC.find('.//' + url + 'summary')
            names = summary.find(url + 'names')
            for name in names.findall(url + "name[@role='author']"):
                full_name = name.find(url + 'full_name')
                if full_name is not None:
                    if full_name.text is not None:
                        r_author.full_name = full_name.text.encode('utf-8')
                wos_standard = name.find(url + 'wos_standard')
                if wos_standard is not None:
                    if wos_standard.text is not None:
                        r_author.wos_standard = wos_standard.text.encode('utf-8')
                r_author.first_name = ''
                first_name = name.find(url + 'first_name')
                if first_name is not None:
                    if first_name.text is not None:
                        r_author.first_name = first_name.text.encode('utf-8')
                last_name = name.find(url + 'last_name')
                if last_name is not None:
                    if last_name.text is not None:
                        r_author.last_name = last_name.text.encode('utf-8')
                email_addr = name.find(url + 'email_addr')
                if email_addr is not None:
                    if email_addr.text is not None:
                        r_author.email_addr = email_addr.text.encode('utf-8')

                r_author.seq_no = name.get('seq_no')
                r_author.dais_id = name.get('dais_id')
                if r_author.dais_id is None:
                    r_author.dais_id = ''
                r_author.r_id = name.get('r_id')
                if r_author.r_id is None:
                    r_author.r_id = ''
                addr_seqs = name.get('addr_no')
                r_author.address_id = ''
                r_author.addr_seq = ''
                if addr_seqs is not None:
                    addr_no_str = addr_seqs.split(' ')
                    for addr_seq in addr_no_str:
                        if addr_seq is not None:
                            addr_index = int(addr_seq)
                            if addr_index in addr_no_list:
                                r_author.address = r_addr.addr_name[addr_index]
                                r_author.address_id = r_addr.id[addr_index]
                                r_author.addr_seq = addr_seq
                                curs.execute(
                                    '<query to upsert data in database>')

                else:
                    r_author.address_id = 0
                    r_author.addr_seq = 0
                    # inserting records into author tables of database
                    curs.execute(
                        '<query to upsert data in database>')


            # parse reference data for each publication
            REFERENCES = REC.find('.//' + url + 'references')
            for ref in REFERENCES.findall('.//' + url + 'reference'):
                r_reference = reference.reference()
                r_reference.source_id = new_pub.source_id
                r_reference.cited_source_uid = None
                cited_source_id = ref.find('.//' + url + 'uid')
                if cited_source_id is not None:
                    if cited_source_id.text is not None:
                        r_reference.cited_source_uid = cited_source_id.text. \
                            encode('utf-8')
                cited_title = ref.find('.//' + url + 'citedTitle')
                if cited_title is not None:
                    if cited_title.text is not None:
                        r_reference.cited_title = cited_title.text.encode('utf-8')
                r_reference.cited_work = ''
                cited_work = ref.find('.//' + url + 'citedWork')
                if cited_work is not None:
                    if cited_work.text is not None:
                        r_reference.cited_work = cited_work.text.encode('utf-8')
                cited_author = ref.find('.//' + url + 'citedAuthor')
                if cited_author is not None:
                    if cited_author.text is not None:
                        r_reference.cited_author = cited_author.text.encode('utf-8')[:299]
                cited_year = ref.find('.//' + url + 'year')
                if cited_year is not None:
                    if cited_year.text is not None:
                        r_reference.cited_year = cited_year.text.encode('utf-8')
                cited_page = ref.find('.//' + url + 'page')
                if cited_page is not None:
                    if cited_page.text is not None:
                        r_reference.cited_page = cited_page.text.encode('utf-8')

                r_reference.created_date = new_pub.created_date
                r_reference.last_modified_date = new_pub.last_modified_date
                if r_reference.cited_source_uid is not None:
                    # inserting references into database
                    curs.execute(
                        '<query to upsert data in database>')
Example 13
 def do_atom_author(self):
   from author import author
   return author(), noduplicates()
Example 14
    def parse(self, xml_string, input_file_name, curs):
        url = '{http://clarivate.com/schema/wok5.27/public/FullRecord}'
        root = ET.fromstring(xml_string)
        for REC in root:
            # parse publications and create a publication object containing all the attributes of publication
            new_pub = pub.publication()


            new_pub.source_id = REC.find(url + 'UID').text

            pub_info = REC.find('.//' + url + 'pub_info')
            new_pub.source_type = pub_info.get('pubtype')

            source_title = REC.find('.//' + url + "title[@type='source']")

            if source_title is not None:
                if source_title.text is not None:
                    new_pub.source_title = source_title.text.encode('utf-8')
            # extracting values from properties of pub_info tag in XML
            new_pub.has_abstract = pub_info.get('has_abstract')
            new_pub.publication_year = pub_info.get('pubyear')
            new_pub.issue = pub_info.get('issue')
            new_pub.volume = pub_info.get('vol')
            new_pub.pubmonth = pub_info.get('pubmonth')
            new_pub.publication_date = pub_info.get('sortdate')
            new_pub.coverdate = pub_info.get('coverdate')

            page_info = pub_info.find(url + 'page')
            if page_info is not None:
                new_pub.begin_page = page_info.get('begin')
                new_pub.end_page = page_info.get('end')

            document_title = REC.find('.//' + url + "title[@type='item']")
            if document_title is not None:
                if document_title.text is not None:
                    new_pub.document_title = document_title.text. \
                        encode('utf-8')

            document_type = REC.find('.//' + url + 'doctype')
            if document_type is not None:
                if document_type.text is not None:
                    new_pub.document_type = document_type.text

            publisher_name = REC.find('.//' + url + "name[@role='publisher']")
            if publisher_name is not None:
                pub_name = publisher_name.find('.//' + url + 'full_name')
                if pub_name is not None:
                    if pub_name.text is not None:
                        new_pub.publisher_name = pub_name.text. \
                            encode('utf-8')

            pub_address_no = REC.find('.//' + url + "address_spec[@addr_no='1']")
            if pub_address_no is not None:
                publisher_address = pub_address_no.find('.//' + url + 'full_address')
                if publisher_address is not None:
                    if publisher_address.text is not None:
                        new_pub.publisher_address = publisher_address.text. \
                            encode('utf-8')

            languages = REC.find('.//' + url + 'languages')
            if languages is not None:
                language = languages.find('.//' + url + 'language')
                if language is not None:
                    if language.text is not None:
                        new_pub.language = language.text.encode('utf-8')

            new_pub.edition = REC.find('.//' + url + 'edition').get('value')
            new_pub.source_filename = input_file_name
            new_pub.created_date = datetime.date.today()
            new_pub.last_modified_date = datetime.date.today()
            ## query to insert a publication record into the publications table in the database
            ## The query may be moved into a separate file in the future, from where it would be read as a string, amended with values, and executed, to make the code cleaner
            # TODO Query below is hard to read. I'd try a multi-line string with the proper SQL formatting.
            curs.execute(
                "INSERT INTO wos_publications(begin_page, created_date, document_title, document_type,edition, end_page,has_abstract,issue,"\
                    "language,last_modified_date,publication_date,publication_year,publisher_address,publisher_name,source_filename,source_id,"\
                    "source_title,source_type,volume)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON CONFLICT (source_id)"\
                    "DO UPDATE SET begin_page = excluded.begin_page, created_date = excluded.created_date,document_title ="\
                    " excluded.document_title, document_type = excluded.document_type, edition = excluded.edition,end_page ="\
                    "excluded.end_page, has_abstract = excluded.has_abstract, issue = excluded.issue,language = excluded.language,"\
                    "last_modified_date = excluded.last_modified_date,publication_date = excluded.publication_date, publication_year"\
                    "= excluded.publication_year,publisher_address = excluded.publisher_address, publisher_name = excluded.publisher_name,"\
                    "source_filename = excluded.source_filename, source_id = excluded.source_id, source_title = excluded.source_title,"\
                    "source_type = excluded.source_type, volume = excluded.volume, last_updated_time=current_timestamp;",
                (str(new_pub.begin_page), new_pub.created_date, str(new_pub.document_title),
                 str(new_pub.document_type), str(new_pub.edition), str(new_pub.end_page), str(new_pub.has_abstract),
                 str(new_pub.issue), str(new_pub.language), new_pub.last_modified_date,
                 new_pub.publication_date, str(new_pub.publication_year), str(new_pub.publisher_address),
                 str(new_pub.publisher_name), str(new_pub.source_filename), str(new_pub.source_id),
                 str(new_pub.source_title), new_pub.source_type, str(new_pub.volume)))
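            # One possible answer to the TODO above (a hypothetical sketch, not
            # the project's actual refactor): keep the statement in a single
            # triple-quoted constant so the column list and the ON CONFLICT
            # clause stay aligned and readable, e.g.
            #
            #     UPSERT_PUBLICATION = """
            #         INSERT INTO wos_publications (source_id, source_title, volume)
            #         VALUES (%s, %s, %s)
            #         ON CONFLICT (source_id) DO UPDATE SET
            #             source_title      = excluded.source_title,
            #             volume            = excluded.volume,
            #             last_updated_time = current_timestamp;
            #     """
            #     curs.execute(UPSERT_PUBLICATION, (new_pub.source_id, new_pub.source_title, new_pub.volume))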

            # parse grants in funding acknowledgements for each publication
            # New approach: store everything in proper objects, which could later grow into classes with their own properties
            r_grant = grant.grant()
            r_grant.source_id = new_pub.source_id

            # r_grant.funding_ack = ''
            FUNDING_ACK = REC.find('.//' + url + 'fund_text')

            if FUNDING_ACK is not None:  # if funding acknowledgement exists, then extract the r_grant(s) data
                funding_ack_p = FUNDING_ACK.find('.//' + url + 'p')
                if funding_ack_p is not None:
                    if funding_ack_p.text is not None:
                        r_grant.funding_ack = funding_ack_p.text.encode('utf-8')
            # looping through all the grant tags
            for l_grant in REC.findall('.//' + url + 'grant'):
                # r_grant.grant_agency = ''
                grant_agency = l_grant.find('.//' + url + 'grant_agency')
                if grant_agency is not None:
                    if grant_agency.text is not None:
                        r_grant.grant_agency = grant_agency.text.encode('utf-8')

                grant_ids = l_grant.find('.//' + url + 'grant_ids')
                if grant_ids is not None:
                    for grant_id in grant_ids.findall('.//' + url + 'grant_id'):
                        if grant_id is not None:
                            if grant_id.text is not None:
                                r_grant.grant_number = grant_id.text.encode('utf-8')
                        if r_grant.funding_ack is not None:
                            # insert the grant details in the grants table if there is any funding acknowledgement in the records
                            curs.execute(
                                "INSERT INTO wos_grants(source_id,grant_number,grant_organization,funding_ack,source_filename)VALUES"\
                                        "(%s,%s,%s,%s,%s) ON CONFLICT (source_id, grant_number, grant_organization) DO UPDATE SET source_id"\
                                        "= excluded.source_id, grant_number = excluded.grant_number,grant_organization ="\
                                        "excluded.grant_organization, funding_ack = excluded.funding_ack,source_filename ="\
                                        "excluded.source_filename, last_updated_time=current_timestamp;",
                                (str(r_grant.source_id), str(r_grant.grant_number),
                                 str(r_grant.grant_agency), str(r_grant.funding_ack), str(new_pub.source_filename)))


            # insert code to insert record in r_grant table
            # parse document object identifiers for each publication
            r_dois = dois.dois()
            r_dois.source_id = new_pub.source_id

            IDS = REC.find('.//' + url + 'identifiers')
            if IDS is not None:
                for identifier in IDS.findall('.//' + url + 'identifier'):
                    id_value = identifier.get('value')
                    if id_value is not None:
                        r_dois.doi = id_value.encode('utf-8')
                    id_type = identifier.get('type')
                    if id_type is not None:
                        r_dois.doi_type = id_type.encode('utf-8')
                    if r_dois.doi is not None:
                        # inserting records into the wos_document_identifier table
                        curs.execute(
                            "INSERT INTO wos_document_identifiers(source_id,document_id,document_id_type,source_filename)VALUES(%s,%s,%s,%s)"\
                                "ON CONFLICT (source_id, document_id_type, document_id) DO UPDATE SET source_id = excluded.source_id,"\
                                "document_id = excluded.document_id,document_id_type = excluded.document_id_type, source_filename ="\
                                "excluded.source_filename, last_updated_time=current_timestamp;",
                            (str(r_dois.source_id), str(r_dois.doi), str(r_dois.doi_type),
                             str(new_pub.source_filename)))


            # parse keyword for each publication
            keywords = REC.find('.//' + url + 'keywords_plus')
            if keywords is not None:
                r_keyword = key_word.wos_keyword()
                r_keyword.source_id = new_pub.source_id
                for keyword in keywords.findall('.//' + url + 'keyword'):
                    if keyword is not None:
                        if keyword.text is not None:
                            r_keyword.keyword = keyword.text.encode('utf-8')
                            # inserting records in wos_keywords
                            curs.execute(
                                "INSERT INTO wos_keywords(source_id,keyword,source_filename)VALUES(%s,%s,%s)ON CONFLICT"\
                                    "(source_id, keyword) DO UPDATE SET source_id = excluded.source_id, keyword = excluded.keyword,"\
                                    "source_filename = excluded.source_filename, last_updated_time=current_timestamp;",
                                (str(r_keyword.source_id), str(r_keyword.keyword),
                                 str(new_pub.source_filename)))


            # parse abstract for each publication
            if new_pub.has_abstract == 'Y':
                abstracts = REC.find('.//' + url + 'abstracts')
                if abstracts is not None:
                    r_abst = abst.abstract()
                    r_abst.source_id = new_pub.source_id
                    r_abstract_text = ''
                    for abstract_text in abstracts.findall('.//' + url + 'p'):
                        if abstract_text is not None:
                            if abstract_text.text is not None:
                                if r_abstract_text != '' and abstract_text.text != '':
                                    r_abstract_text += '\n\n'  # blank line between paragraphs
                                r_abstract_text = r_abstract_text + abstract_text.text.encode('utf-8')
                    # joining all the abstract paragraphs into one before writing it into the database
                    r_abst.abstract_text = re.sub(r"^[\n]+", "", r_abstract_text)
                    # writing the abstracts record into the data base
                    curs.execute(
                        "INSERT INTO wos_abstracts(source_id,abstract_text,source_filename)VALUES(%s,%s,%s) ON CONFLICT(source_id) DO UPDATE"\
                            " SET source_id = excluded.source_id,abstract_text = excluded.abstract_text,source_filename = excluded.source_filename, last_updated_time=current_timestamp;",
                        (str(r_abst.source_id), str(r_abst.abstract_text), str(new_pub.source_filename)))



            # parse addresses for each publication

            r_addr = add.address()
            addr_no_list = []
            addresses = REC.find('.//' + url + 'addresses')
            for addr in addresses.findall('.//' + url + 'address_spec'):

                addr_ind = addr.get('addr_no')
                if addr_ind is None:
                    addr_ind = 0
                else:
                    addr_ind = int(addr_ind)
                    # Keep all addr_no values for the later reference by authors
                    addr_no_list.append(int(addr_ind))

                r_addr.source_id[addr_ind] = new_pub.source_id
                r_addr.addr_name[addr_ind] = ''
                addr_name = addr.find('.//' + url + 'full_address')
                if addr_name is not None:
                    if addr_name.text is not None:
                        r_addr.addr_name[addr_ind] = addr_name.text.encode('utf-8')
                r_addr.organization[addr_ind] = ''
                organization = addr.find('.//' + url + "organization[@pref='Y']")
                if organization is not None:
                    if organization.text is not None:
                        r_addr.organization[addr_ind] = organization.text. \
                            encode('utf-8')
                r_addr.sub_organization[addr_ind] = ''
                suborganization = addr.find('.//' + url + 'suborganization')
                if suborganization is not None:
                    if suborganization.text is not None:
                        r_addr.sub_organization[addr_ind] = suborganization.text. \
                            encode('utf-8')
                r_addr.city[addr_ind] = ''
                city = addr.find('.//' + url + 'city')
                if city is not None:
                    if city.text is not None:
                        r_addr.city[addr_ind] = city.text.encode('utf-8')
                r_addr.country[addr_ind] = ''
                country = addr.find('.//' + url + 'country')
                if country is not None:
                    if country.text is not None:
                        r_addr.country[addr_ind] = country.text.encode('utf-8')
                r_addr.zip_code[addr_ind] = ''
                addr_zip = addr.find('.//' + url + 'zip')
                if addr_zip is not None:
                    if addr_zip.text is not None:
                        r_addr.zip_code[addr_ind] = addr_zip.text.encode('utf-8')
                if r_addr.addr_name[addr_ind] is not None:
                    # Inserting address records into the database, then retrieving and storing the address_id for later use when inserting authors
                    curs.execute(
                        "INSERT INTO wos_addresses(source_id,address_name,organization,sub_organization,city,country,zip_code,source_filename)"\
                            "VALUES(%s,%s,%s,%s,%s,%s,%s,%s)ON CONFLICT (source_id, address_name) DO UPDATE SET source_id = excluded.source_id,"\
                            "address_name = excluded.address_name,organization = excluded.organization, sub_organization = excluded.sub_organization,"\
                            "city = excluded.city,country = excluded.country, zip_code = excluded.zip_code, source_filename = excluded.source_filename RETURNING id, last_updated_time=current_timestamp;",
                        (str(r_addr.source_id[addr_ind]), str(r_addr.addr_name[addr_ind]),
                         str(r_addr.organization[addr_ind]), str(r_addr.sub_organization[addr_ind]),
                         str(r_addr.city[addr_ind]), str(r_addr.country[addr_ind]), str(r_addr.zip_code[addr_ind]),
                         str(new_pub.source_filename)))
                    r_addr.id[addr_ind] = curs.fetchone()[0]


            # parse titles for each publication
            r_title = ti.title()
            r_title.source_id = new_pub.source_id

            summary = REC.find('.//' + url + 'summary')
            if summary is not None:
                titles = summary.find('.//' + url + 'titles')
                if titles is not None:
                    for title in titles.findall('.//' + url + 'title'):
                        if title is not None:
                            if title.text is not None:
                                r_title.title = title.text.encode('utf-8')
                                r_title.type = title.get('type')
                                # inserting titles into the database
                                curs.execute(
                                    "INSERT INTO wos_titles(source_id,title,type,source_filename)VALUES(%s,%s,%s,%s)ON CONFLICT (source_id, type)"\
                                        "DO UPDATE SET source_id = excluded.source_id, title = excluded.title, type = excluded.type,source_filename ="\
                                        "excluded.source_filename, last_updated_time=current_timestamp;",
                                    (str(r_title.source_id), str(r_title.title), str(r_title.type),
                                     str(new_pub.source_filename)))


            # parse subjects for each publication
            r_subjects = sb.subjects()
            r_subjects.source_id = new_pub.source_id

            subjects = REC.find('.//' + url + 'subjects')
            if subjects is not None:
                for subject in subjects.findall('.//' + url + 'subject'):
                    if subject is not None:
                        if subject.text is not None:
                            r_subjects.subject = subject.text.encode('utf-8')
                            r_subjects.subject_classification_type = subject.get('ascatype')
                            #inserting subjects into the database
                            curs.execute(
                                    "INSERT INTO wos_publication_subjects(source_id,subject_classification_type,subject,source_filename)VALUES(%s,%s,%s,%s)ON CONFLICT (source_id,subject_classification_type,subject)"\
                                        "DO UPDATE SET source_id = excluded.source_id, subject_classification_type = excluded.subject_classification_type, subject = excluded.subject,source_filename ="\
                                        "excluded.source_filename, last_updated_time=current_timestamp;",
                                    (str(r_subjects.source_id), str(r_subjects.subject_classification_type), str(r_subjects.subject),
                                     str(new_pub.source_filename)))


            # parse authors for each publication
            r_author = auth.author()
            r_author.source_id = new_pub.source_id

            summary = REC.find('.//' + url + 'summary')
            names = summary.find(url + 'names')
            for name in names.findall(url + "name[@role='author']"):
                full_name = name.find(url + 'full_name')
                if full_name is not None:
                    if full_name.text is not None:
                        r_author.full_name = full_name.text.encode('utf-8')
                wos_standard = name.find(url + 'wos_standard')
                if wos_standard is not None:
                    if wos_standard.text is not None:
                        r_author.wos_standard = wos_standard.text.encode('utf-8')
                r_author.first_name = ''
                first_name = name.find(url + 'first_name')
                if first_name is not None:
                    if first_name.text is not None:
                        r_author.first_name = first_name.text.encode('utf-8')
                last_name = name.find(url + 'last_name')
                if last_name is not None:
                    if last_name.text is not None:
                        r_author.last_name = last_name.text.encode('utf-8')
                email_addr = name.find(url + 'email_addr')
                if email_addr is not None:
                    if email_addr.text is not None:
                        r_author.email_addr = email_addr.text.encode('utf-8')

                r_author.seq_no = name.get('seq_no')
                r_author.dais_id = name.get('dais_id')
                if r_author.dais_id is None:
                    r_author.dais_id = ''
                r_author.r_id = name.get('r_id')
                if r_author.r_id is None:
                    r_author.r_id = ''
                addr_seqs = name.get('addr_no')
                r_author.address_id = ''
                r_author.addr_seq = ''
                if addr_seqs is not None:
                    addr_no_str = addr_seqs.split(' ')
                    for addr_seq in addr_no_str:
                        if addr_seq is not None:
                            addr_index = int(addr_seq)
                            if addr_index in addr_no_list:
                                r_author.address = r_addr.addr_name[addr_index]
                                r_author.address_id = r_addr.id[addr_index]
                                r_author.addr_seq = addr_seq
                                curs.execute(
                                    "INSERT INTO wos_authors(source_id,full_name,last_name,first_name,seq_no,address_seq,address,email_address,address_id,"\
                                        "dais_id,r_id,source_filename)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON CONFLICT (source_id, seq_no, address_id)"\
                                        "DO UPDATE SET source_id = excluded.source_id, full_name = excluded.full_name,last_name = excluded.last_name, first_name"\
                                        "= excluded.first_name, seq_no = excluded.seq_no,address_seq = excluded.address_seq, address = excluded.address,"\
                                        "email_address = excluded.email_address,address_id = excluded.address_id, dais_id = excluded.dais_id, r_id ="\
                                        "excluded.r_id,source_filename = excluded.source_filename, last_updated_time=current_timestamp;",
                                    (str(r_author.source_id), str(r_author.full_name),
                                     str(r_author.last_name), str(r_author.first_name), str(r_author.seq_no),
                                     str(r_author.addr_seq), str(r_author.address), str(r_author.email_addr),
                                     str(r_author.address_id), str(r_author.dais_id), str(r_author.r_id),
                                     str(new_pub.source_filename)))

                else:
                    r_author.address_id = 0
                    r_author.addr_seq = 0
                    # inserting records into author tables of database
                    curs.execute(
                        "INSERT INTO wos_authors(source_id,full_name,last_name,first_name,seq_no,email_address,dais_id,r_id,source_filename)"\
                            "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)ON CONFLICT (source_id, seq_no, address_id) DO UPDATE SET source_id ="\
                            "excluded.source_id, full_name = excluded.full_name,last_name = excluded.last_name, first_name ="\
                            "excluded.first_name, seq_no = excluded.seq_no,address_seq = excluded.address_seq, address = excluded.address,"\
                            "email_address = excluded.email_address,address_id = excluded.address_id, dais_id = excluded.dais_id, r_id ="\
                            "excluded.r_id,source_filename = excluded.source_filename, last_updated_time=current_timestamp;",
                        (
                            str(r_author.source_id), str(r_author.full_name), str(r_author.last_name),
                            str(r_author.first_name), str(r_author.seq_no), str(r_author.email_addr),
                            str(r_author.dais_id), str(r_author.r_id), str(new_pub.source_filename)))


            # parse reference data for each publication
            REFERENCES = REC.find('.//' + url + 'references')
            for ref in REFERENCES.findall('.//' + url + 'reference'):
                try:
                    r_reference = reference.reference()
                    r_reference.source_id = new_pub.source_id
                    r_reference.cited_source_uid = None
                    cited_source_id = ref.find('.//' + url + 'uid')
                    if cited_source_id is not None:
                        if cited_source_id.text is not None:
                            r_reference.cited_source_uid = cited_source_id.text. \
                                encode('utf-8')
                    cited_title = ref.find('.//' + url + 'citedTitle')
                    if cited_title is not None:
                        if cited_title.text is not None:
                            r_reference.cited_title = cited_title.text.encode('utf-8')
                    r_reference.cited_work = ''
                    cited_work = ref.find('.//' + url + 'citedWork')
                    if cited_work is not None:
                        if cited_work.text is not None:
                            r_reference.cited_work = cited_work.text.encode('utf-8')
                    cited_author = ref.find('.//' + url + 'citedAuthor')
                    if cited_author is not None:
                        if cited_author.text is not None:
                            r_reference.cited_author = cited_author.text.encode('utf-8')[:299]
                    cited_year = ref.find('.//' + url + 'year')
                    if cited_year is not None:
                        if cited_year.text is not None:
                            r_reference.cited_year = cited_year.text.encode('utf-8')
                    cited_page = ref.find('.//' + url + 'page')
                    if cited_page is not None:
                        if cited_page.text is not None:
                            r_reference.cited_page = cited_page.text.encode('utf-8')

                    r_reference.created_date = new_pub.created_date
                    r_reference.last_modified_date = new_pub.last_modified_date
                    if r_reference.cited_source_uid is not None:
                        # inserting references into database
                        curs.execute(
                            "INSERT INTO wos_references(source_id,cited_source_uid,cited_title,cited_work,cited_author,cited_year,cited_page,"\
                                "created_date,last_modified_date,source_filename)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)ON CONFLICT ON CONSTRAINT"\
                                " wos_references_pk DO UPDATE SET source_id = excluded.source_id, cited_source_uid = excluded.cited_source_uid,"\
                                "cited_title = excluded.cited_title, cited_work = excluded.cited_work, cited_author = excluded.cited_author,"\
                                "cited_year = excluded.cited_year, cited_page = excluded.cited_page, created_date = excluded.created_date,"\
                                "last_modified_date = excluded.last_modified_date, source_filename = excluded.source_filename, last_updated_time=current_timestamp;",
                            (str(r_reference.source_id), str(r_reference.cited_source_uid),
                             str(r_reference.cited_title), str(r_reference.cited_work), str(r_reference.cited_author),
                             str(r_reference.cited_year), str(r_reference.cited_page), str(r_reference.created_date),
                             str(r_reference.last_modified_date), str(new_pub.source_filename)))
                except Exception:
                    print "ERROR occurred for the following reference record:\n", r_reference
                    raise
Example 15
            love_id = re.findall('href="(.*?)喜欢的音乐', html)  # '喜欢的音乐' is the 'liked music' link text
            if love_id == []:
                continue
            love_id = re.findall(r'\d+', love_id[0])
            print(love_id)
            web2.close()
            last_id.append(love_id[0])
        try:
            last_page.click()
        except:
            break
    web.close()
    return last_id


a = author()
a.dizhi = 'http://music.163.com/#/playlist?id=133998351'  # NetEase Cloud Music page of the person the recommendation is for
a.love_music()
print(a.love)
base_url = 'http://music.163.com/#/playlist?id='  # renamed from 'str' so the builtin is not shadowed
l1 = get_id('http://music.163.com/#/user/follows?id=109538358')  # URL of the page listing the people they follow
l1.extend(get_id('xxxxxxxxxxxxxx'))  # another followed-users page URL; I usually only use the one for people I follow. extend (not append) keeps l1 a flat list of ids
# l1 = ['622800043']
l = []
for i in l1:
    if i not in l:
        l.append(i)
s = []
for i in l:
    b = author()
    b.dizhi = base_url + i
Example 16
    def do_contributor(self):
        from author import author

        return author()
Example 17
 def do_atom_contributor(self):
     self.metadata()
     from author import author
     return author()
Example 18
    def parse(self, xml_string, counters, input_file_name, curs):
        url = '{http://clarivate.com/schema/wok5.27/public/FullRecord}'

        try:
            root = ET.fromstring(xml_string)
            for REC in root:
                # parse publications and create a publication object containing all the attributes of publication
                new_pub = pub.publication()
                # old method, commented out
                # r_publication = dict()
                # Counter class generates surrogate ids; temporary for now, to be replaced later by auto-incrementing columns in the database

                counters.r_publication_seq += 1
                new_pub.id = counters.r_publication_seq
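                # A counters object of the kind used above could be as small as
                # this (hypothetical sketch, not the project's actual class):
                #
                #     class Counters(object):
                #         def __init__(self):
                #             self.r_publication_seq = 0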
                # Find the UID by locating the UID tag inside the record
                new_pub.source_id = REC.find(url + 'UID').text

                pub_info = REC.find('.//' + url + 'pub_info')
                new_pub.source_type = pub_info.get('pubtype')

                source_title = REC.find('.//' + url + "title[@type='source']")

                if source_title is not None:
                    if source_title.text is not None:
                        new_pub.source_title = source_title.text.encode(
                            'utf-8')
                # extract values from the attributes of the pub_info tag in the XML
                new_pub.has_abstract = pub_info.get('has_abstract')
                new_pub.publication_year = pub_info.get('pubyear')
                new_pub.issue = pub_info.get('issue')
                new_pub.volume = pub_info.get('vol')
                new_pub.pubmonth = pub_info.get('pubmonth')
                new_pub.publication_date = pub_info.get('sortdate')
                new_pub.coverdate = pub_info.get('coverdate')

                page_info = pub_info.find(url + 'page')
                if page_info is not None:
                    new_pub.begin_page = page_info.get('begin')
                    new_pub.end_page = page_info.get('end')

                document_title = REC.find('.//' + url + "title[@type='item']")
                if document_title is not None:
                    if document_title.text is not None:
                        new_pub.document_title = document_title.text. \
                            encode('utf-8')

                document_type = REC.find('.//' + url + 'doctype')
                if document_type is not None:
                    if document_type.text is not None:
                        new_pub.document_type = document_type.text
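                # Editor's sketch (find_text is hypothetical, not part of this
                # module): the repeated "is not None / .text is not None"
                # checks throughout this parser could be collapsed with a
                # local helper such as:
                #
                #     def find_text(parent, tag, default=''):
                #         node = parent.find('.//' + url + tag)
                #         if node is not None and node.text is not None:
                #             return node.text.encode('utf-8')
                #         return default
                #
                # e.g. new_pub.document_type = find_text(REC, 'doctype')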

                publisher_name = REC.find('.//' + url +
                                          "name[@role='publisher']")
                if publisher_name is not None:
                    pub_name = publisher_name.find('.//' + url + 'full_name')
                    if pub_name is not None:
                        if pub_name.text is not None:
                            new_pub.publisher_name = pub_name.text. \
                                encode('utf-8')

                pub_address_no = REC.find('.//' + url +
                                          "address_spec[@addr_no='1']")
                if pub_address_no is not None:
                    publisher_address = pub_address_no.find('.//' + url +
                                                            'full_address')
                    if publisher_address is not None:
                        if publisher_address.text is not None:
                            new_pub.publisher_address = publisher_address.text. \
                                encode('utf-8')

                # r_publication['language'] = ''
                languages = REC.find('.//' + url + 'languages')
                if languages is not None:
                    language = languages.find('.//' + url + 'language')
                    if language is not None:
                        if language.text is not None:
                            new_pub.language = language.text.encode('utf-8')

                new_pub.edition = REC.find('.//' + url +
                                           'edition').get('value')
                new_pub.source_filename = input_file_name
                new_pub.created_date = datetime.date.today()
                new_pub.last_modified_date = datetime.date.today()
                ## query to insert a publication record into the publications table in the database
                ## The query may later be moved to a separate file, read in as a string, amended with the values, and executed, to keep this code cleaner
                curs.execute(
                    "INSERT INTO wos_publications(begin_page, created_date, document_title, document_type,edition, end_page,has_abstract,id,issue,language,last_modified_date,publication_date,publication_year,publisher_address,publisher_name,source_filename,source_id,source_title,source_type,volume)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON CONFLICT (source_id) DO UPDATE SET begin_page = excluded.begin_page, created_date = excluded.created_date,document_title = excluded.document_title, document_type = excluded.document_type, edition = excluded.edition,end_page = excluded.end_page, has_abstract = excluded.has_abstract, id = excluded.id, issue = excluded.issue,language = excluded.language, last_modified_date = excluded.last_modified_date,publication_date = excluded.publication_date, publication_year = excluded.publication_year,publisher_address = excluded.publisher_address, publisher_name = excluded.publisher_name,source_filename = excluded.source_filename, source_id = excluded.source_id, source_title = excluded.source_title,source_type = excluded.source_type, volume = excluded.volume;",
                    (str(new_pub.begin_page), new_pub.created_date,
                     str(new_pub.document_title), str(new_pub.document_type),
                     str(new_pub.edition), str(new_pub.end_page),
                     str(new_pub.has_abstract), str(
                         new_pub.id), str(new_pub.issue), str(
                             new_pub.language), new_pub.last_modified_date,
                     new_pub.publication_date, str(new_pub.publication_year),
                     str(new_pub.publisher_address), str(
                         new_pub.publisher_name), str(new_pub.source_filename),
                     str(new_pub.source_id), str(new_pub.source_title),
                     new_pub.source_type, str(new_pub.volume)))
                ##old code for writing the publications data into a CSV file
                '''writer_pub.writerow((r_publication['id'], r_publication['source_id'], \
                                     r_publication['source_type'], r_publication['source_title'], \
                                     r_publication['language'], r_publication['document_title'], \
                                     r_publication['document_type'], r_publication['has_abstract'], \
                                     r_publication['issue'], r_publication['volume'], \
                                     r_publication['begin_page'], r_publication['end_page'], \
                                     r_publication['publisher_name'], r_publication['publisher_address'], \
                                     r_publication['publication_year'], r_publication['publication_date'], \
                                     r_publication['created_date'], r_publication['last_modified_date'], \
                                     r_publication['edition'], r_publication['source_filename']))'''
                # parse grants in funding acknowledgements for each publication
                # old method of creating a dict type
                # r_grant = dict( )

                # New method: store everything in proper objects, which could grow into classes with their own properties in future
                r_grant = grant.grant()
                r_grant.source_id = new_pub.source_id

                # r_grant.funding_ack = ''
                FUNDING_ACK = REC.find('.//' + url + 'fund_text')

                if FUNDING_ACK is not None:  # if funding acknowledgement exists, then extract the r_grant(s) data
                    funding_ack_p = FUNDING_ACK.find('.//' + url + 'p')
                    if funding_ack_p is not None:
                        if funding_ack_p.text is not None:
                            r_grant.funding_ack = funding_ack_p.text.encode(
                                'utf-8')
                # looping through all the r_grant tags
                for l_grant in REC.findall('.//' + url + 'grant'):
                    # r_grant.grant_agency = ''
                    grant_agency = l_grant.find('.//' + url + 'grant_agency')
                    if grant_agency is not None:
                        if grant_agency.text is not None:
                            r_grant.grant_agency = grant_agency.text.encode(
                                'utf-8')

                    grant_ids = l_grant.find('.//' + url + 'grant_ids')
                    if grant_ids is not None:
                        for grant_id in grant_ids.findall('.//' + url +
                                                          'grant_id'):
                            counters.r_grant_seq = counters.r_grant_seq + 1
                            r_grant.id = counters.r_grant_seq
                            # r_grant.grant_number = ''
                            if grant_id is not None:
                                if grant_id.text is not None:
                                    r_grant.grant_number = grant_id.text.encode(
                                        'utf-8')
                            if r_grant.funding_ack is not None:
                                # insert the grant details in the grants table if there is any funding acknowledgement in the records
                                curs.execute(
                                    "INSERT INTO wos_grants(id,source_id,grant_number,grant_organization,funding_ack,source_filename)VALUES(%s,%s,%s,%s,%s,%s) ON CONFLICT (source_id, grant_number, grant_organization) DO UPDATE SET id = excluded.id, source_id = excluded.source_id, grant_number = excluded.grant_number,grant_organization = excluded.grant_organization, funding_ack = excluded.funding_ack,source_filename = excluded.source_filename;",
                                    (str(r_grant.id), str(r_grant.source_id),
                                     str(r_grant.grant_number),
                                     str(r_grant.grant_agency),
                                     str(r_grant.funding_ack),
                                     str(new_pub.source_filename)))
                                '''writer_grant.writerow((r_grant['id'],r_grant['source_id'],\
                                r_grant['grant_number'],r_grant['grant_agency'],\
                                r_grant['funding_ack'],\
                                r_publication['source_filename']))'''
                # insert code to insert record in r_grant table
                # parse document object identifiers for each publication
                r_dois = dois.dois()
                r_dois.source_id = new_pub.source_id

                IDS = REC.find('.//' + url + 'identifiers')
                if IDS is not None:
                    for identifier in IDS.findall('.//' + url + 'identifier'):
                        # r_dois['doi'] = None
                        id_value = identifier.get('value')
                        if id_value is not None:
                            r_dois.doi = id_value.encode('utf-8')
                        # r_dois['doi_type'] = ''
                        id_type = identifier.get('type')
                        if id_type is not None:
                            r_dois.doi_type = id_type.encode('utf-8')
                        # write each doi to CSV file for wos_document_identifiers table
                        if r_dois.doi is not None:
                            counters.r_doi_seq = counters.r_doi_seq + 1
                            r_dois.id = counters.r_doi_seq
                            # inserting records into the wos_document_identifiers table
                            curs.execute(
                                "INSERT INTO wos_document_identifiers(id,source_id,document_id,document_id_type,source_filename)VALUES(%s,%s,%s,%s,%s) ON CONFLICT (source_id, document_id_type, document_id) DO UPDATE SET id = excluded.id, source_id = excluded.source_id, document_id = excluded.document_id,document_id_type = excluded.document_id_type, source_filename = excluded.source_filename;",
                                (str(r_dois.id), str(r_dois.source_id),
                                 str(r_dois.doi), str(r_dois.doi_type),
                                 str(new_pub.source_filename)))
                            '''writer_dois.writerow((r_dois['id'], r_dois['source_id'], \
                                                  r_dois['doi'], r_dois['doi_type'], \
                                                  r_publication['source_filename']))'''
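                        # Editor's note: r_dois (like r_grant above) is reused
                        # across loop iterations, so a doi or doi_type from a
                        # previous identifier can leak into the next row when
                        # an attribute is missing; resetting both fields at the
                        # top of the loop would avoid that.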

                # parse keyword for each publication
                keywords = REC.find('.//' + url + 'keywords_plus')
                if keywords is not None:
                    r_keyword = key_word.keyword()
                    r_keyword.source_id = new_pub.source_id
                    for keyword in keywords.findall('.//' + url + 'keyword'):
                        if keyword is not None:
                            if keyword.text is not None:
                                r_keyword.keyword = keyword.text.encode(
                                    'utf-8')
                                counters.r_keyword_seq = counters.r_keyword_seq + 1
                                r_keyword.id = counters.r_keyword_seq
                                # inserting records in wos_keywords
                                curs.execute(
                                    "INSERT INTO wos_keywords(id,source_id,keyword,source_filename)VALUES(%s,%s,%s,%s)ON CONFLICT (source_id, keyword) DO UPDATE SET id = excluded.id, source_id = excluded.source_id, keyword = excluded.keyword,source_filename = excluded.source_filename;",
                                    (str(r_keyword.id), str(
                                        r_keyword.source_id),
                                     str(r_keyword.keyword),
                                     str(new_pub.source_filename)))
                                # old code for inserting data into a text file
                                '''writer_keyword.writerow((r_keyword['id'], \
                                                         r_keyword['source_id'], r_keyword['keyword'], \
                                                         r_publication['source_filename']))'''

                # parse abstract for each publication
                if new_pub.has_abstract == 'Y':
                    abstracts = REC.find('.//' + url + 'abstracts')
                    if abstracts is not None:
                        r_abst = abst.abstract()
                        r_abst.source_id = new_pub.source_id
                        r_abstract_text = ''
                        for abstract_text in abstracts.findall('.//' + url +
                                                               'p'):
                            if abstract_text is not None:
                                if abstract_text.text is not None:
                                    if r_abstract_text:
                                        # separate paragraphs with a blank line;
                                        # the original mistakenly called str.join here
                                        r_abstract_text += '\n\n'
                                    r_abstract_text = r_abstract_text + abstract_text.text.encode(
                                        'utf-8')
                        # join all abstract paragraphs into one string before writing it to the database
                        r_abst.abstract_text = r_abstract_text

                        # old code
                        # r_abst['abstract_text'] = abstract_text.text.\
                        #                           encode('utf-8')
                        # r_abstract_seq +=1
                        # r_abst['id'] = r_abstract_seq
                        # writer_abstract.writerow((r_abst['id'],\
                        #     r_abst['source_id'],r_abst['abstract_text'],\
                        #     r_publication['source_filename']))
                        # writing the abstracts record into the data base
                        curs.execute(
                            "INSERT INTO wos_abstracts(source_id,abstract_text,source_filename)VALUES(%s,%s,%s) ON CONFLICT (source_id) DO UPDATE SET source_id = excluded.source_id, abstract_text = excluded.abstract_text, source_filename = excluded.source_filename;;",
                            (str(r_abst.source_id), str(r_abst.abstract_text),
                             str(new_pub.source_filename)))
                        '''writer_abstract.writerow(
                            (r_abst['source_id'], r_abst['abstract_text'], r_publication['source_filename']))'''
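                        # Editor's sketch (not original code): collecting the
                        # paragraphs first and joining once is the more common
                        # idiom:
                        #     paras = [p.text.encode('utf-8')
                        #              for p in abstracts.findall('.//' + url + 'p')
                        #              if p is not None and p.text is not None]
                        #     r_abst.abstract_text = '\n\n'.join(paras)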

                # parse addresses for each publication

                r_addr = add.address()
                # r_addr.id = {}
                # r_addr.source_id = {}
                # r_addr['addr_name'] = {}
                # r_addr['organization'] = {}
                # r_addr['suborganization'] = {}
                # r_addr['city'] = {}
                # r_addr['country'] = {}
                # r_addr['zip'] = {}
                addr_no_list = []
                addresses = REC.find('.//' + url + 'addresses')
                for addr in addresses.findall('.//' + url + 'address_spec'):

                    addr_ind = addr.get('addr_no')
                    if addr_ind is None:
                        addr_ind = 0
                    else:
                        addr_ind = int(addr_ind)
                        # keep every addr_no so the author records below can reference it
                        addr_no_list.append(int(addr_ind))

                    r_addr.source_id[addr_ind] = new_pub.source_id
                    r_addr.addr_name[addr_ind] = ''
                    addr_name = addr.find('.//' + url + 'full_address')
                    if addr_name is not None:
                        if addr_name.text is not None:
                            r_addr.addr_name[addr_ind] = addr_name.text.encode(
                                'utf-8')
                    r_addr.organization[addr_ind] = ''
                    organization = addr.find('.//' + url +
                                             "organization[@pref='Y']")
                    if organization is not None:
                        if organization.text is not None:
                            r_addr.organization[addr_ind] = organization.text. \
                                encode('utf-8')
                    r_addr.sub_organization[addr_ind] = ''
                    suborganization = addr.find('.//' + url +
                                                'suborganization')
                    if suborganization is not None:
                        if suborganization.text is not None:
                            r_addr.sub_organization[addr_ind] = suborganization.text. \
                                encode('utf-8')
                    r_addr.city[addr_ind] = ''
                    city = addr.find('.//' + url + 'city')
                    if city is not None:
                        if city.text is not None:
                            r_addr.city[addr_ind] = city.text.encode('utf-8')
                    r_addr.country[addr_ind] = ''
                    country = addr.find('.//' + url + 'country')
                    if country is not None:
                        if country.text is not None:
                            r_addr.country[addr_ind] = country.text.encode(
                                'utf-8')
                    r_addr.zip_code[addr_ind] = ''
                    addr_zip = addr.find('.//' + url + 'zip')
                    if addr_zip is not None:
                        if addr_zip.text is not None:
                            r_addr.zip_code[addr_ind] = addr_zip.text.encode(
                                'utf-8')
                    if r_addr.addr_name[addr_ind] is not None:
                        counters.r_addr_seq += 1
                        r_addr.id[addr_ind] = counters.r_addr_seq
                        # inserting address records into the database
                        curs.execute(
                            "INSERT INTO wos_addresses(id,source_id,address_name,organization,sub_organization,city,country,zip_code,source_filename)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)ON CONFLICT (source_id, address_name) DO UPDATE SET id = excluded.id, source_id = excluded.source_id, address_name = excluded.address_name,organization = excluded.organization, sub_organization = excluded.sub_organization, city = excluded.city,country = excluded.country, zip_code = excluded.zip_code, source_filename = excluded.source_filename;",
                            (str(r_addr.id[addr_ind]),
                             str(r_addr.source_id[addr_ind]),
                             str(r_addr.addr_name[addr_ind]),
                             str(r_addr.organization[addr_ind]),
                             str(r_addr.sub_organization[addr_ind]),
                             str(r_addr.city[addr_ind]),
                             str(r_addr.country[addr_ind]),
                             str(r_addr.zip_code[addr_ind]),
                             str(new_pub.source_filename)))
                        '''writer_address.writerow((r_addr['id'][addr_ind], \
                                                 r_addr['source_id'][addr_ind], r_addr['addr_name'][addr_ind], \
                                                 r_addr['organization'][addr_ind], \
                                                 r_addr['suborganization'][addr_ind], r_addr['city'][addr_ind], \
                                                 r_addr['country'][addr_ind], r_addr['zip'][addr_ind], \
                                                 r_publication['source_filename']))'''

                # parse titles for each publication
                r_title = ti.title()
                r_title.source_id = new_pub.source_id
                r_title.id = counters.r_title_seq

                summary = REC.find('.//' + url + 'summary')
                if summary is not None:
                    titles = summary.find('.//' + url + 'titles')
                    if titles is not None:
                        for title in titles.findall('.//' + url + 'title'):
                            if title is not None:
                                if title.text is not None:
                                    r_title.title = title.text.encode('utf-8')
                                    r_title.type = title.get('type')
                                    r_title.id += 1
                                    # inserting titles into the database
                                    curs.execute(
                                        "INSERT INTO wos_titles(id,source_id,title,type,source_filename)VALUES(%s,%s,%s,%s,%s)ON CONFLICT (source_id, type) DO UPDATE SET id = excluded.id, source_id = excluded.source_id, title = excluded.title, type = excluded.type,source_filename = excluded.source_filename;",
                                        (str(r_title.id), str(
                                            r_title.source_id),
                                         str(r_title.title), str(r_title.type),
                                         str(new_pub.source_filename)))
                                    '''writer_title.writerow((r_title['id'], \
                                                           r_title['source_id'], r_title['title'], \
                                                           r_title['type'], r_publication['source_filename']))'''

                # parse authors for each publication
                r_author = auth.author()
                r_author.source_id = new_pub.source_id

                summary = REC.find('.//' + url + 'summary')
                names = summary.find(url + 'names')
                for name in names.findall(url + "name[@role='author']"):
                    # for name in REC.findall('.//'+url+"name[@role='author']"):
                    # r_author.full_name = ''
                    full_name = name.find(url + 'full_name')
                    if full_name is not None:
                        if full_name.text is not None:
                            r_author.full_name = full_name.text.encode('utf-8')
                    # r_author['wos_standard'] = ''
                    wos_standard = name.find(url + 'wos_standard')
                    if wos_standard is not None:
                        if wos_standard.text is not None:
                            r_author.wos_standard = wos_standard.text.encode(
                                'utf-8')
                    r_author.first_name = ''
                    first_name = name.find(url + 'first_name')
                    if first_name is not None:
                        if first_name.text is not None:
                            r_author.first_name = first_name.text.encode(
                                'utf-8')
                    # r_author.last_name = ''
                    last_name = name.find(url + 'last_name')
                    if last_name is not None:
                        if last_name.text is not None:
                            r_author.last_name = last_name.text.encode('utf-8')
                    # r_author['email_addr'] = ''
                    email_addr = name.find(url + 'email_addr')
                    if email_addr is not None:
                        if email_addr.text is not None:
                            r_author.email_addr = email_addr.text.encode(
                                'utf-8')

                    r_author.seq_no = name.get('seq_no')
                    r_author.dais_id = name.get('dais_id')
                    r_author.r_id = name.get('r_id')
                    addr_seqs = name.get('addr_no')
                    # r_author.address = ''
                    r_author.address_id = ''
                    r_author.addr_seq = ''
                    if addr_seqs is not None:
                        addr_no_str = addr_seqs.split(' ')
                        for addr_seq in addr_no_str:
                            if addr_seq is not None:
                                addr_index = int(addr_seq)
                                if addr_index in addr_no_list:
                                    r_author.address = r_addr.addr_name[
                                        addr_index]
                                    r_author.address_id = r_addr.id[addr_index]
                                    r_author.addr_seq = addr_seq
                                    counters.r_author_seq += 1
                                    r_author.id = counters.r_author_seq
                                    # inserting records into the authors table of the database
                                    curs.execute(
                                        "INSERT INTO wos_authors(id,source_id,full_name,last_name,first_name,seq_no,address_seq,address,email_address,address_id,dais_id,r_id,source_filename)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON CONFLICT (source_id, seq_no, address_id) DO UPDATE SET id = excluded.id, source_id = excluded.source_id, full_name = excluded.full_name,last_name = excluded.last_name, first_name = excluded.first_name, seq_no = excluded.seq_no,address_seq = excluded.address_seq, address = excluded.address, email_address = excluded.email_address,address_id = excluded.address_id, dais_id = excluded.dais_id, r_id = excluded.r_id,source_filename = excluded.source_filename;",
                                        (str(r_author.id),
                                         str(r_author.source_id),
                                         str(r_author.full_name),
                                         str(r_author.last_name),
                                         str(r_author.first_name),
                                         str(r_author.seq_no),
                                         str(r_author.addr_seq),
                                         str(r_author.address),
                                         str(r_author.email_addr),
                                         str(r_author.address_id),
                                         str(r_author.dais_id),
                                         str(r_author.r_id),
                                         str(new_pub.source_filename)))
                                    '''writer_author.writerow((r_author['id'], \
                                                            r_author['source_id'], r_author['full_name'], \
                                                            r_author['last_name'], r_author['first_name'], \
                                                            r_author['seq_no'], r_author['addr_seq'], \
                                                            r_author['address'], r_author['email_addr'], \
                                                            r_author['address_id'], r_author['dais_id'], \
                                                            r_author['r_id'], r_publication['source_filename']))'''
                    else:
                        counters.r_author_seq += 1
                        r_author.id = counters.r_author_seq
                        r_author.address_id = 0
                        r_author.addr_seq = 0
                        # inserting records into the authors table of the database
                        curs.execute(
                            "INSERT INTO wos_authors(id,source_id,full_name,last_name,first_name,seq_no,email_address,dais_id,r_id,source_filename)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)ON CONFLICT (source_id, seq_no, address_id) DO UPDATE SET id = excluded.id, source_id = excluded.source_id, full_name = excluded.full_name,last_name = excluded.last_name, first_name = excluded.first_name, seq_no = excluded.seq_no,address_seq = excluded.address_seq, address = excluded.address, email_address = excluded.email_address,address_id = excluded.address_id, dais_id = excluded.dais_id, r_id = excluded.r_id,source_filename = excluded.source_filename;",
                            (str(r_author.id), str(r_author.source_id),
                             str(r_author.full_name), str(r_author.last_name),
                             str(r_author.first_name), str(r_author.seq_no),
                             str(r_author.email_addr), str(r_author.dais_id),
                             str(r_author.r_id), str(new_pub.source_filename)))
                        '''writer_author.writerow((r_author['id'], r_author['source_id'], \
                                                r_author['full_name'], r_author['last_name'], \
                                                r_author['first_name'], r_author['seq_no'], \
                                                r_author['addr_seq'], r_author['address'], \
                                                r_author['email_addr'], r_author['address_id'], \
                                                r_author['dais_id'], r_author['r_id'], \
                                                r_publication['source_filename']))'''
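                # Editor's note: a name's addr_no attribute can hold several
                # space-separated indexes (e.g. addr_no="1 3"), so the loop
                # above emits one wos_authors row per (author, address) pair;
                # authors with no address get a single row with address_id 0.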

                # parse reference data for each publication
                REFERENCES = REC.find('.//' + url + 'references')
                for ref in REFERENCES.findall('.//' + url + 'reference'):
                    # print "inside reference"
                    r_reference = reference.reference()
                    r_reference.source_id = new_pub.source_id
                    r_reference.cited_source_uid = None
                    cited_source_id = ref.find('.//' + url + 'uid')
                    if cited_source_id is not None:
                        if cited_source_id.text is not None:
                            r_reference.cited_source_uid = cited_source_id.text. \
                                encode('utf-8')
                    # r_reference['cited_title'] = ''
                    cited_title = ref.find('.//' + url + 'citedTitle')
                    if cited_title is not None:
                        if cited_title.text is not None:
                            r_reference.cited_title = cited_title.text.encode(
                                'utf-8')
                    r_reference.cited_work = ''
                    cited_work = ref.find('.//' + url + 'citedWork')
                    if cited_work is not None:
                        if cited_work.text is not None:
                            r_reference.cited_work = cited_work.text.encode(
                                'utf-8')
                    # r_reference['cited_author'] = ''
                    cited_author = ref.find('.//' + url + 'citedAuthor')
                    if cited_author is not None:
                        if cited_author.text is not None:
                            r_reference.cited_author = cited_author.text.encode(
                                'utf-8')[:299]
                    # r_reference['cited_year'] = ''
                    cited_year = ref.find('.//' + url + 'year')
                    if cited_year is not None:
                        if cited_year.text is not None:
                            r_reference.cited_year = cited_year.text.encode(
                                'utf-8')
                    # r_reference.cited_page = ''
                    cited_page = ref.find('.//' + url + 'page')
                    if cited_page is not None:
                        if cited_page.text is not None:
                            r_reference.cited_page = cited_page.text.encode(
                                'utf-8')

                    r_reference.created_date = new_pub.created_date
                    r_reference.last_modified_date = new_pub.last_modified_date
                    if r_reference.cited_source_uid is not None:
                        counters.r_reference_seq = counters.r_reference_seq + 1
                        r_reference.id = counters.r_reference_seq
                        # inserting references into database
                        curs.execute(
                            "INSERT INTO wos_references(wos_reference_id,source_id,cited_source_uid,cited_title,cited_work,cited_author,cited_year,cited_page,created_date,last_modified_date,source_filename)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)ON CONFLICT ON CONSTRAINT wos_references_pk DO UPDATE SET source_id = excluded.source_id, cited_source_uid = excluded.cited_source_uid,cited_title = excluded.cited_title, cited_work = excluded.cited_work, cited_author = excluded.cited_author,cited_year = excluded.cited_year, cited_page = excluded.cited_page, created_date = excluded.created_date,last_modified_date = excluded.last_modified_date, source_filename = excluded.source_filename;",
                            (str(r_reference.id), str(r_reference.source_id),
                             str(r_reference.cited_source_uid),
                             str(r_reference.cited_title),
                             str(r_reference.cited_work),
                             str(r_reference.cited_author),
                             str(r_reference.cited_year),
                             str(r_reference.cited_page),
                             str(r_reference.created_date),
                             str(r_reference.last_modified_date),
                             str(new_pub.source_filename)))
                        '''writer_ref.writerow((r_reference['id'], r_reference['source_id'], \
                                             r_reference['cited_source_id'], r_reference['cited_title'], \
                                             r_reference['cited_work'], r_reference['cited_author'], \
                                             r_reference['cited_year'], r_reference['cited_page'], \
                                             r_reference['created_date'], r_reference['last_modified_date'], \
                                             r_publication['source_filename']))'''
            '''print "Processed", r_publication_seq, "records from", input_csv_dir + input_filename

                                                                                :-4] + "_publication.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))
            copy_command = "\\copy new_wos_references from '" + xml_csv_dir + input_filename[
                                                                              :-4] + "_reference.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))
            copy_command = "\\copy new_wos_grants from '" + xml_csv_dir + input_filename[
                                                                          :-4] + "_grant.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))
            copy_command = "\\copy new_wos_addresses from '" + xml_csv_dir + input_filename[
                                                                             :-4] + "_address.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))
            copy_command = "\\copy new_wos_authors from '" + xml_csv_dir + input_filename[
                                                                           :-4] + "_author.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))
            copy_command = "\\copy new_wos_document_identifiers from '" + xml_csv_dir + input_filename[
                                                                                        :-4] + "_dois.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))
            copy_command = "\\copy new_wos_abstracts from '" + xml_csv_dir + input_filename[
                                                                             :-4] + "_abstract.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))
            copy_command = "\\copy new_wos_keywords from '" + xml_csv_dir + input_filename[
                                                                            :-4] + "_keyword.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))
            copy_command = "\\copy new_wos_titles from '" + xml_csv_dir + input_filename[
                                                                          :-4] + "_title.csv'" + " delimiter ',' CSV;\n"
            csvfile_load.write((copy_command))

            # Close all opened files
            csvfile_publication.close()
            csvfile_reference.close()
            csvfile_abstract.close()
            csvfile_address.close()
            csvfile_author.close()
            csvfile_dois.close()
            csvfile_grant.close()
            csvfile_keyword.close()
            csvfile_title.close()
            csvfile_load.close()

                #print(rec.find(self.url + 'UID').text)'''

            # print('Database connection closed.')
        except ET.ParseError as error:
            print(error)
        return counters
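For context, a hypothetical driver for the parse() method above. This is a sketch only: the WosParser class name, the Counters attributes, the DSN, and the input file name are assumptions rather than parts of the snippet.

import psycopg2

class Counters(object):
    """Holds the surrogate-id sequences that parse() increments."""
    def __init__(self):
        self.r_publication_seq = 0
        self.r_grant_seq = 0
        self.r_doi_seq = 0
        self.r_keyword_seq = 0
        self.r_addr_seq = 0
        self.r_title_seq = 0
        self.r_author_seq = 0
        self.r_reference_seq = 0

conn = psycopg2.connect('dbname=wos user=loader')   # hypothetical DSN
curs = conn.cursor()
with open('wos_records.xml') as f:                  # hypothetical input file
    counters = WosParser().parse(f.read(), Counters(), f.name, curs)
conn.commit()
conn.close()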