def GetIndexFromNormaliseFile(filename, index=None):
    if index is None:
        index = IndexBuilder()
    print("Creating the index for file: " + filename)

    lignes = txtFileToListe(filename)
    for ligne in lignes:
        index.AddElem(ligne)
    return index
Example #2
    def process_testing_snaps(self, input_path, url, num_servers):
        dirs = [os.path.join(input_path, f) for f in os.listdir(input_path)
                if os.path.isdir(os.path.join(input_path, f))]

        index_builder = IndexBuilder('model.h5', input_path, 'resources')

        data_frame = pd.DataFrame(columns=['actual', 'precision', 'recall', 'is_first'])

        # Store all fingerprints and vectors as tuples in a list.
        for directory in dirs:  # avoid shadowing the built-in dir()
            actual = os.path.split(directory)[-1]
            print(actual)
            df = self._run(index_builder, directory, url, num_servers)

            if df is not None:
                df['actual'] = actual

                # With a single relevant label per directory, precision and recall coincide.
                precision = 1 if df[df['predicted'] == actual]['predicted'].count() > 0 else 0
                recall = precision
                top = df.loc[df['score'].idxmax()]
                is_first = 1 if top['predicted'] == actual else 0

                # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
                data_frame = pd.concat(
                    [data_frame,
                     pd.DataFrame(data=[[actual, precision, recall, is_first]],
                                  columns=['actual', 'precision', 'recall', 'is_first'])],
                    ignore_index=True)

        return data_frame
Example #3
    def test_run(self, input_path, url, num_servers):
        index_builder = IndexBuilder('model.h5', input_path, 'resources')

        actual = os.path.split(input_path)[-1]  # last path component, as in process_testing_snaps
        df = self._run(index_builder, input_path, url, num_servers)
        if df is not None:
            df['actual'] = actual

        return df
def getIndexFromTweet(index=None):
    if index is None:
        index = IndexBuilder()

    fileName = "Normalisation/70kTweet.txt"
    index = GetIndexFromNormaliseFile(fileName, index)

    return index
def getIndexFromDonneeTest(index=None):
    if index is None:
        index = IndexBuilder()

    fileName = "Normalisation/texteDT.txt"
    index = GetIndexFromNormaliseFile(fileName, index)

    return index
def getIndexFromTP(index=None):
    if index is None:
        index = IndexBuilder()

    fileName = "Normalisation/SVMtrain.txt"
    index = GetIndexFromNormaliseFile(fileName, index)
    fileName = "Normalisation/SVMtest.txt"
    index = GetIndexFromNormaliseFile(fileName, index)

    return index
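Taken together, these helpers grow a single index over several normalized files. A minimal usage sketch, assuming IndexBuilder and the Normalisation/*.txt files referenced above exist:

index = IndexBuilder()
index = getIndexFromTweet(index)       # adds Normalisation/70kTweet.txt
index = getIndexFromDonneeTest(index)  # adds Normalisation/texteDT.txt
index = getIndexFromTP(index)          # adds Normalisation/SVMtrain.txt and SVMtest.txt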
def EnleverMotVideSpecifique(fichier):
    pathFile = "Normalisation/" + fichier + ".txt"
    index = IndexBuilder()
    lignes = txtFileToListe(pathFile, withSpaceTreatment=True)
    for ligne in lignes:
        for mot in ligne.split():
            index.AddElem(mot)
    nbrOccurences = index.GetNombreOccurence()
    # Threshold: 10% of the number of lines in the file.
    pc10 = int(float(len(lignes)) / 100 * 10)
    motVideSpecifique = {}
    # Collect corpus-specific stop words: any word occurring on more lines
    # than the threshold.
    for nbrOccurence in nbrOccurences:
        occurence = nbrOccurence[0]
        nbr = nbrOccurence[1][0]
        if nbr > pc10:
            motVideSpecifique[occurence] = 0
    res = ""
    # Rebuild the file contents without the stop words.
    for ligne in lignes:
        for mot in ligne.split():
            if mot not in motVideSpecifique:  # dict.has_key() was removed in Python 3
                res += mot + " "
        res += "\n"
    print(motVideSpecifique)
    WriteInFile(pathFile, res)
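The function rewrites the file in place, so a caller passes only the file stem. A sketch, with "SVMtrain" being one of the Normalisation files used above:

EnleverMotVideSpecifique("SVMtrain")  # strips words occurring on more than 10% of lines from Normalisation/SVMtrain.txt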
Example #9
    def run_local(self):
        test_files = [f for f in os.listdir(self.test_snap_out_path)
                      if os.path.isdir(os.path.join(self.test_snap_out_path, f))]
    
        all_results = []
        actuals = []
        
        base_url = 'data'
        index = IndexBuilder('model.h5', os.path.join(base_url, 'snapshots'), 'resources')

        pattern = re.compile(r'.*\.jpg')  # raw string so the regex escape survives
        for t in test_files:
            print(t)
            actuals.append(t)
            filepath = os.path.join(self.test_snap_out_path, t)
            images = [os.path.join(filepath, f) for f in os.listdir(filepath)
                      if os.path.isfile(os.path.join(filepath, f)) and pattern.findall(f)]

            results = []
            for img in images:
                fp, query_vec = index.finger_print(img)
                if str(fp) in self._master:
                    vec_list = self._master[str(fp)]
                    for ans_vec in vec_list:
                        c = self._cosine_similarity(query_vec, ans_vec[1])
                        results.append([ans_vec[0].split('_')[0], c])
                
            all_results.append(results)

        for ind, test in enumerate(test_files):
            if len(all_results[ind]) > 0:
                df = pd.DataFrame(all_results[ind], columns=["video_id", "cosine"]).fillna(0)
                df_new = df.groupby("video_id")["cosine"].agg(['mean', 'count']).reset_index()
                
                for i in range(df_new.shape[0]):
                    # .ix was removed from pandas; .loc works on the reset integer index.
                    df_new.loc[i, "prediction"] = self._video_dict[df_new.loc[i, "video_id"]]
                    df_new.loc[i, "score"] = df_new.loc[i, "mean"] * df_new.loc[i, "count"]
                df_new["Actual"] = test
                df_new.sort_values(by="score",inplace=True,ascending=False)
                self.generate_output_path('./data/test_answers')
                df_new.to_csv(os.path.join("data/test_answers",test+".csv"),index=False)
                

        with open('./resources/urls.json', 'r') as f:
            url_dict = json.load(f)

        final_df = pd.DataFrame(columns=['actual', 'predicted', 'precision', 'recall', 'accuracy', 'ndcg', 'urls'])
        for file in os.listdir("data/test_answers/"):
            if fnmatch.fnmatch(os.path.join('data/test_answers', file), '*.csv') and 'final.csv' not in file:
                print("File name is ", file)
                df = pd.read_csv(os.path.join('data/test_answers', file))
                for i in range(min(10, df.shape[0])):
                    if df.loc[i, "prediction"] == df.loc[i, "Actual"]:
                        precision = 1
                        recall = 1
                        # Reciprocal log discount; rank 1 (i == 0) gets full credit.
                        ndcg = 1 / (1 if i == 0 else math.log(i + 1, 2))
                        break
                    else:
                        precision = 0
                        recall = 0
                        ndcg = 0

                if df.loc[0, "prediction"] == df.loc[0, "Actual"]:
                    accuracy = 1
                else:
                    accuracy = 0

                # Kept inside the if-block so df always refers to the file just read.
                predicted = df.loc[0, "prediction"]
                #url_regex=":|;><#$%,.'\"\\\""
                #pred=''.join(e for e in predicted if e not in url_regex)
                url = url_dict.get(predicted, None)

                if url is None:
                    print(f'Predicted Name {predicted}')
                final_df = pd.concat(
                    [final_df,
                     pd.DataFrame(data=[[df.loc[0, "Actual"], df.loc[0, "prediction"],
                                         precision, recall, accuracy, ndcg, url]],
                                  columns=['actual', 'predicted', 'precision', 'recall',
                                           'accuracy', 'ndcg', 'urls'])],
                    ignore_index=True)
        final_df.to_csv(os.path.join('data/test_answers', "final.csv"), index=False)

        return final_df
class DataHandler(xml.sax.handler.ContentHandler):
    '''
        SAX Parser
    '''
    flag = False

    def __init__(self):
        self.titleFound = False
        self.idFound = False
        self.textFound = False
        self.docId = ""
        self.title = ""
        self.text = ""
        self.indexFileNumber = 0
        self.offset = 0
        self.outputDir = ""
        self.index = {}
        self.titleTermFreq = {}
        self.infoBoxTermFreq = {}
        self.bodyTermFreq = {}
        self.categoryTermFreq = {}
        self.externalLinkTermFreq = {}
        self.referenceTermFreq = {}
        self.docIdCounter = 0
        self.docIdTitleMapping = {}
        self.textProcessor = TextProcessor()
        self.indexBuilder = IndexBuilder()
        self.docIdToTextMapping = {}

    def startElement(self, name, attrs):
        '''
        Invoked when the start of a tag is seen by the SAX API
        '''
        if name == "id" and not self.idFound:
            self.docId = ""
            self.idFound = True

        if name == "title" and not self.titleFound:
            self.title = ""
            self.titleFound = True

        if name == "text" and not self.textFound:
            self.text = ""
            self.textFound = True

    def characters(self, content):
        '''
        Invoked when characters inside a tag are found
        '''
        if self.idFound:
            self.docId += content

        elif self.titleFound:
            self.title += content
            self.docIdTitleMapping[self.docIdCounter] = content

        elif self.textFound:
            self.text += content

    def endElement(self, name):
        '''
        Invoked when a tag end is encountered
        '''
        if name == "id":
            self.idFound = False
        elif name == "title":
            self.titleFound = False
            #self.titleTermFreq = self.textProcessor.processTitle(self.title)

        elif name == "text":
            self.textFound = False
            self.text = self.text.lower()
            self.docIdToTextMapping[self.docIdCounter] = self.text
            self.docIdCounter += 1
            if self.docIdCounter % 5000 == 0:
                processedTitleBulk = self.textProcessor.processTitleBulk(
                    self.docIdTitleMapping)
                processedTextBulk = self.textProcessor.processTextBulk(
                    self.docIdToTextMapping)
                self.indexBuilder.buildIndexBulk(processedTextBulk,
                                                 processedTitleBulk,
                                                 self.docIdTitleMapping)
                print("Total docs parsed: ", self.docIdCounter)
                self.docIdToTextMapping = {}
                self.docIdTitleMapping = {}

        elif name == "page":
            DataHandler.flag = False
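DataHandler only reacts to parse events, so something has to drive it. A minimal driver sketch using the standard-library SAX parser; the dump file name is illustrative:

import xml.sax

parser = xml.sax.make_parser()
parser.setContentHandler(DataHandler())
parser.parse("enwiki-dump.xml")  # hypothetical path to the Wikipedia XML dump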
Example #11
    def runIndexBuilder(self, is_stopping, is_stemmed_corpus):
        n = 1
        builder = IndexBuilder(False, is_stopping, is_stemmed_corpus)
        # is_stopping takes precedence over is_stemmed_corpus, matching the
        # final assignment in the original if-cascade.
        if is_stopping:
            builder.output_files_location = self.stopped_index_folder
        elif is_stemmed_corpus:
            builder.output_files_location = self.stemmed_index_folder
        else:
            builder.output_files_location = self.index_folder
        builder.build_inverted_index(n)
        builder.generate_tf_table()
        builder.generate_df_table()
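A usage sketch for the three index variants; runner stands in for whatever object owns this method and the *_folder attributes (hypothetical name):

runner.runIndexBuilder(is_stopping=False, is_stemmed_corpus=False)  # plain index
runner.runIndexBuilder(is_stopping=True, is_stemmed_corpus=False)   # stop-worded index
runner.runIndexBuilder(is_stopping=False, is_stemmed_corpus=True)   # stemmed index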
Example #12
File: Main.py  Project: aparamshetti/ViFi
# Note: logger and file_name are defined earlier in Main.py and not shown in this snippet.
formatter = logging.Formatter('%(asctime)s:%(filename)s:%(message)s')
file_handler = logging.FileHandler(f'./logs/{file_name}')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)


class Main:
    def __init__(self):
        self.list_of_required_folders = ['data', 'logs', 'master', 'resources']
        self.base_path = os.path.dirname(os.path.realpath(__file__)).replace(
            "\\", "/")

    def build_folder_structure(self):
        for folder in self.list_of_required_folders:
            if not os.path.exists(f'{self.base_path}/{folder}'):
                os.mkdir(f'{self.base_path}/{folder}')
                logger.info(f"Creating folder : {folder}")

    def generate_dataset(self):
        logger.info('Downloading videos')
        YoutubeDownloader.main()
        logger.info('Capturing Snapshots')
        CaptureSnapshots.main()


if __name__ == '__main__':
    _main = Main()
    _main.build_folder_structure()
    _main.generate_dataset()
    IndexBuilder.main()