Example #1
0
class ReadData():
    """Parse the dataset text file and store each paper record in redis.

    A record is a run of consecutive lines that start with ``#``; the
    character right after the ``#`` selects the field being set
    (``*`` title, ``@`` authors, ``t`` year, ``c`` venue, ``i`` paper ID,
    ``%`` reference, ``!`` abstract).  Any line that does not start with
    ``#`` terminates the current record and triggers a save attempt.
    """

    def __init__(self):
        """Create the redis helper and initialise the per-record fields."""
        self.redis = RedisHelper()
        self.title = str()
        self.authors = list()
        self.year = 1970
        self.venue = str()
        self.paperID = 0
        self.references = list()
        self.abstract = str()
        self.authorCoauthor = dict()
        # Number of record terminators processed so far (progress logging).
        self.count = 0
        logging.info('starting.....')

    def clearVar(self):
        """Reset every per-record field to its default value.

        ``paperID`` is reset as well, so a record without an ID line can
        never be stored under the ID of the record that preceded it.
        """
        self.title = ""
        self.venue = ""
        self.abstract = ""
        self.authors = []
        self.references = []
        self.authorCoauthor = {}
        self.year = 1970
        self.paperID = 0

    def _flushRecord(self):
        """Count, save and clear the record accumulated so far."""
        self.count += 1
        if self.count % 10000 == 0:
            logging.info(self.count)
        self.save2Redis()
        self.clearVar()

    def read(self):
        """Read ``PATH_DATASET_TXT`` line by line and save every record.

        A record is flushed whenever a line not starting with ``#`` is
        seen, and once more at end of file if fields are still pending, so
        the last record is kept even without a trailing separator line.
        """
        with open(PATH_DATASET_TXT) as fileReader:
            logging.info('reading.....')
            pending = False  # True while unsaved '#' lines have been read
            for line in fileReader:
                if line[:1] != '#':
                    self._flushRecord()
                    pending = False
                    continue
                pending = True
                # Slice instead of index: a bare '#' at EOF has no 2nd char.
                tag = line[1:2]
                if tag == '*':  # title
                    self.title = line.strip('\n\r')[2:]
                elif tag == '@':  # authors
                    self.authors.extend(line.strip('\n\r')[2:].split(','))
                elif tag == 't':  # year
                    self.year = line.strip()[2:]
                elif tag == 'c':  # venue
                    self.venue = line.strip('\n\r')[2:]
                elif tag == 'i':  # paperID
                    # Drops a 6-character prefix (presumably '#index').
                    self.paperID = line.strip()[6:]
                elif tag == '%':  # references
                    self.references.append(line.strip()[2:])
                elif tag == '!':  # abstract
                    self.abstract = line.strip('\n\r')[2:]
            if pending:
                self._flushRecord()

    def save2Redis(self):
        """Write the current record to redis if it passes all filters.

        The record is skipped when the venue does not match ``rewords``,
        when the paper ID or year is missing, when the year is not an
        integer, or when the year is earlier than ``PAPER_START_YEAR``.
        Empty reference strings are ignored, and each author/paper link is
        written exactly once.
        """
        if not re.search(rewords, self.venue):
            return
        # A falsy ID ('' or the default 0) means no ID line was read.
        if not self.paperID or self.year == '':
            return
        try:
            year = int(self.year)
        except ValueError:
            logging.warning('skipping paper %s: invalid year %r',
                            self.paperID, self.year)
            return
        if year < PAPER_START_YEAR:
            return

        paperID = self.paperID
        self.redis.addPaperYear(paperID, self.year)
        self.redis.addPaperVenue(paperID, self.venue)
        self.redis.addPaperTitle(paperID, self.title)
        self.redis.addPaperAbstract(paperID, self.abstract)

        # The list may be empty or hold '' entries from bare '#%' lines.
        for reference in self.references:
            if reference == '':
                continue
            self.redis.addPaperReferences(paperID, reference)
            self.redis.addPaperRefered(reference, paperID)

        for author in self.authors:
            self.redis.addAuthorPapers(author, paperID)
            self.redis.addPaperAuthors(paperID, author)

        # Link every unordered pair of authors in both directions.
        for position, author in enumerate(self.authors):
            for coauthor in self.authors[position + 1:]:
                self.redis.addAuthorCoauthor(author, coauthor)
                self.redis.addAuthorCoauthor(coauthor, author)